/* * This file implements the construction, copying, and destruction * aspects of NumPy's nditer. * * Copyright (c) 2010-2011 by Mark Wiebe (mwwiebe@gmail.com) * The University of British Columbia * * Copyright (c) 2011 Enthought, Inc * * See LICENSE.txt for the license. */ #define NPY_NO_DEPRECATED_API NPY_API_VERSION /* Allow this .c file to include nditer_impl.h */ #define NPY_ITERATOR_IMPLEMENTATION_CODE #include "nditer_impl.h" #include "arrayobject.h" #include "array_coercion.h" #include "templ_common.h" #include "array_assign.h" #include "dtype_traversal.h" /* Internal helper functions private to this file */ static int npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags); static int npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes, const npy_intp *itershape); static int npyiter_calculate_ndim(int nop, PyArrayObject **op_in, int oa_ndim); static int npyiter_check_per_op_flags(npy_uint32 flags, npyiter_opitflags *op_itflags); static int npyiter_prepare_one_operand(PyArrayObject **op, char **op_dataptr, PyArray_Descr *op_request_dtype, PyArray_Descr** op_dtype, npy_uint32 flags, npy_uint32 op_flags, npyiter_opitflags *op_itflags); static int npyiter_prepare_operands(int nop, PyArrayObject **op_in, PyArrayObject **op, char **op_dataptr, PyArray_Descr **op_request_dtypes, PyArray_Descr **op_dtype, npy_uint32 flags, npy_uint32 *op_flags, npyiter_opitflags *op_itflags, npy_int8 *out_maskop); static int npyiter_check_casting(int nop, PyArrayObject **op, PyArray_Descr **op_dtype, NPY_CASTING casting, npyiter_opitflags *op_itflags); static int npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itflags, char **op_dataptr, const npy_uint32 *op_flags, int **op_axes, npy_intp const *itershape); static inline int npyiter_get_op_axis(int axis, npy_bool *reduction_axis); static void npyiter_replace_axisdata( NpyIter *iter, int iop, PyArrayObject *op, int orig_op_ndim, const int *op_axes); static void npyiter_compute_index_strides(NpyIter *iter, npy_uint32 flags); static void npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order); static void npyiter_flip_negative_strides(NpyIter *iter); static void npyiter_reverse_axis_ordering(NpyIter *iter); static void npyiter_find_best_axis_ordering(NpyIter *iter); static PyArray_Descr * npyiter_get_common_dtype(int nop, PyArrayObject **op, const npyiter_opitflags *op_itflags, PyArray_Descr **op_dtype, PyArray_Descr **op_request_dtypes, int only_inputs); static PyArrayObject * npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, npy_uint32 flags, npyiter_opitflags *op_itflags, int op_ndim, npy_intp const *shape, PyArray_Descr *op_dtype, const int *op_axes); static int npyiter_allocate_arrays(NpyIter *iter, npy_uint32 flags, PyArray_Descr **op_dtype, PyTypeObject *subtype, const npy_uint32 *op_flags, npyiter_opitflags *op_itflags, int **op_axes); static void npyiter_get_priority_subtype(int nop, PyArrayObject **op, const npyiter_opitflags *op_itflags, double *subtype_priority, PyTypeObject **subtype); static int npyiter_allocate_transfer_functions(NpyIter *iter); /*NUMPY_API * Allocate a new iterator for multiple array objects, and advanced * options for controlling the broadcasting, shape, and buffer size. 
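 *
 * A minimal caller-side sketch (illustrative only; `a` is a hypothetical
 * input PyArrayObject* owned by the caller, the NULL output is allocated
 * by the iterator):
 *
 *     PyArrayObject *ops[2] = {a, NULL};
 *     npy_uint32 op_flags[2] = {NPY_ITER_READONLY,
 *                               NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE};
 *     NpyIter *it = NpyIter_AdvancedNew(2, ops,
 *                       NPY_ITER_BUFFERED | NPY_ITER_EXTERNAL_LOOP,
 *                       NPY_KEEPORDER, NPY_SAFE_CASTING,
 *                       op_flags, NULL,     // dtypes resolved automatically
 *                       -1, NULL, NULL, 0); // no op_axes/itershape,
 *                                           // default buffersize
 *     if (it == NULL) {
 *         // construction failed, error already set
 *     }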
*/
NPY_NO_EXPORT NpyIter *
NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 flags,
                 NPY_ORDER order, NPY_CASTING casting,
                 npy_uint32 *op_flags, PyArray_Descr **op_request_dtypes,
                 int oa_ndim, int **op_axes, npy_intp *itershape,
                 npy_intp buffersize)
{
    npy_uint32 itflags = NPY_ITFLAG_IDENTPERM;
    int idim, ndim;
    int iop;

    /* The iterator being constructed */
    NpyIter *iter;

    /* Per-operand values */
    PyArrayObject **op;
    PyArray_Descr **op_dtype;
    npyiter_opitflags *op_itflags;
    char **op_dataptr;

    npy_int8 *perm;
    NpyIter_BufferData *bufferdata = NULL;
    int any_allocate = 0, any_missing_dtypes = 0, need_subtype = 0;

    /* The subtype for automatically allocated outputs */
    double subtype_priority = NPY_PRIORITY;
    PyTypeObject *subtype = &PyArray_Type;

#if NPY_IT_CONSTRUCTION_TIMING
    npy_intp c_temp,
            c_start,
            c_check_op_axes,
            c_check_global_flags,
            c_calculate_ndim,
            c_malloc,
            c_prepare_operands,
            c_fill_axisdata,
            c_compute_index_strides,
            c_apply_forced_iteration_order,
            c_find_best_axis_ordering,
            c_get_priority_subtype,
            c_find_output_common_dtype,
            c_check_casting,
            c_allocate_arrays,
            c_coalesce_axes,
            c_prepare_buffers;
#endif

    NPY_IT_TIME_POINT(c_start);

    if (nop > NPY_MAXARGS) {
        PyErr_Format(PyExc_ValueError,
            "Cannot construct an iterator with more than %d operands "
            "(%d were requested)", NPY_MAXARGS, nop);
        return NULL;
    }

    /*
     * Before 1.8, if `oa_ndim == 0`, this meant `op_axes != NULL` was an error.
     * With 1.8, `oa_ndim == -1` takes this role, while op_axes in that case
     * enforces a 0-d iterator. Using `oa_ndim == 0` with `op_axes == NULL`
     * is thus an error in 1.13 after deprecation.
     */
    if ((oa_ndim == 0) && (op_axes == NULL)) {
        PyErr_Format(PyExc_ValueError,
            "Using `oa_ndim == 0` when `op_axes` is NULL. "
            "Use `oa_ndim == -1` or the MultiNew "
            "iterator for NumPy <1.8 compatibility");
        return NULL;
    }

    /* Error check 'oa_ndim' and 'op_axes', which must be used together */
    if (!npyiter_check_op_axes(nop, oa_ndim, op_axes, itershape)) {
        return NULL;
    }

    NPY_IT_TIME_POINT(c_check_op_axes);

    /* Check the global iterator flags */
    if (!npyiter_check_global_flags(flags, &itflags)) {
        return NULL;
    }

    NPY_IT_TIME_POINT(c_check_global_flags);

    /* Calculate how many dimensions the iterator should have */
    ndim = npyiter_calculate_ndim(nop, op_in, oa_ndim);

    NPY_IT_TIME_POINT(c_calculate_ndim);

    /* Allocate memory for the iterator */
    iter = (NpyIter*)
                PyObject_Malloc(NIT_SIZEOF_ITERATOR(itflags, ndim, nop));
    if (iter == NULL) {
        return NULL;
    }

    NPY_IT_TIME_POINT(c_malloc);

    /* Fill in the basic data */
    NIT_ITFLAGS(iter) = itflags;
    NIT_NDIM(iter) = ndim;
    NIT_NOP(iter) = nop;
    NIT_MASKOP(iter) = -1;
    NIT_ITERINDEX(iter) = 0;
    memset(NIT_BASEOFFSETS(iter), 0, (nop+1)*NPY_SIZEOF_INTP);

    op = NIT_OPERANDS(iter);
    op_dtype = NIT_DTYPES(iter);
    op_itflags = NIT_OPITFLAGS(iter);
    op_dataptr = NIT_RESETDATAPTR(iter);

    /* Prepare all the operands */
    if (!npyiter_prepare_operands(nop, op_in, op, op_dataptr,
                        op_request_dtypes, op_dtype,
                        flags, op_flags, op_itflags,
                        &NIT_MASKOP(iter))) {
        PyObject_Free(iter);
        return NULL;
    }
    /* Set resetindex to zero as well (it's just after the resetdataptr) */
    op_dataptr[nop] = 0;

    NPY_IT_TIME_POINT(c_prepare_operands);

    /*
     * Initialize buffer data (must set the buffers and transferdata
     * to NULL before we might deallocate the iterator).
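 *
 * A sketch of the invariant this zeroing provides (illustrative, not extra
 * logic; `info` here stands for the transferinfo entries): NpyIter_Deallocate
 * walks every buffer and transfer slot unconditionally, so "never allocated"
 * must be indistinguishable from "already released":
 *
 *     PyArray_free(buffers[iop]);       // freeing NULL is a no-op
 *     NPY_cast_info_xfree(&info[iop]);  // safe only if zero-initialized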
*/ if (itflags & NPY_ITFLAG_BUFFER) { bufferdata = NIT_BUFFERDATA(iter); NBF_SIZE(bufferdata) = 0; memset(NBF_BUFFERS(bufferdata), 0, nop*NPY_SIZEOF_INTP); memset(NBF_PTRS(bufferdata), 0, nop*NPY_SIZEOF_INTP); /* Ensure that the transferdata/auxdata is NULLed */ memset(NBF_TRANSFERINFO(bufferdata), 0, nop * sizeof(NpyIter_TransferInfo)); } /* Fill in the AXISDATA arrays and set the ITERSIZE field */ if (!npyiter_fill_axisdata(iter, flags, op_itflags, op_dataptr, op_flags, op_axes, itershape)) { NpyIter_Deallocate(iter); return NULL; } NPY_IT_TIME_POINT(c_fill_axisdata); if (itflags & NPY_ITFLAG_BUFFER) { /* * If buffering is enabled and no buffersize was given, use a default * chosen to be big enough to get some amortization benefits, but * small enough to be cache-friendly. */ if (buffersize <= 0) { buffersize = NPY_BUFSIZE; } /* No point in a buffer bigger than the iteration size */ if (buffersize > NIT_ITERSIZE(iter)) { buffersize = NIT_ITERSIZE(iter); } NBF_BUFFERSIZE(bufferdata) = buffersize; /* * Initialize for use in FirstVisit, which may be called before * the buffers are filled and the reduce pos is updated. */ NBF_REDUCE_POS(bufferdata) = 0; } /* * If an index was requested, compute the strides for it. * Note that we must do this before changing the order of the * axes */ npyiter_compute_index_strides(iter, flags); NPY_IT_TIME_POINT(c_compute_index_strides); /* Initialize the perm to the identity */ perm = NIT_PERM(iter); for(idim = 0; idim < ndim; ++idim) { perm[idim] = (npy_int8)idim; } /* * If an iteration order is being forced, apply it. */ npyiter_apply_forced_iteration_order(iter, order); itflags = NIT_ITFLAGS(iter); NPY_IT_TIME_POINT(c_apply_forced_iteration_order); /* Set some flags for allocated outputs */ for (iop = 0; iop < nop; ++iop) { if (op[iop] == NULL) { /* Flag this so later we can avoid flipping axes */ any_allocate = 1; /* If a subtype may be used, indicate so */ if (!(op_flags[iop] & NPY_ITER_NO_SUBTYPE)) { need_subtype = 1; } /* * If the data type wasn't provided, will need to * calculate it. */ if (op_dtype[iop] == NULL) { any_missing_dtypes = 1; } } } /* * If the ordering was not forced, reorder the axes * and flip negative strides to find the best one. */ if (!(itflags & NPY_ITFLAG_FORCEDORDER)) { if (ndim > 1) { npyiter_find_best_axis_ordering(iter); } /* * If there's an output being allocated, we must not negate * any strides. */ if (!any_allocate && !(flags & NPY_ITER_DONT_NEGATE_STRIDES)) { npyiter_flip_negative_strides(iter); } itflags = NIT_ITFLAGS(iter); } NPY_IT_TIME_POINT(c_find_best_axis_ordering); if (need_subtype) { npyiter_get_priority_subtype(nop, op, op_itflags, &subtype_priority, &subtype); } NPY_IT_TIME_POINT(c_get_priority_subtype); /* * If an automatically allocated output didn't have a specified * dtype, we need to figure it out now, before allocating the outputs. 
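 *
 * The promotion this relies on is ordinary NumPy result-type resolution;
 * a standalone sketch (`d_i4` and `d_f8` are hypothetical int32/float64
 * descriptors obtained elsewhere):
 *
 *     PyArray_Descr *dtypes[2] = {d_i4, d_f8};
 *     PyArray_Descr *common = PyArray_ResultType(0, NULL, 2, dtypes);
 *     // common is float64; npyiter_get_common_dtype() below produces the
 *     // same kind of answer from the operands and requested dtypes.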
*/ if (any_missing_dtypes || (flags & NPY_ITER_COMMON_DTYPE)) { PyArray_Descr *dtype; int only_inputs = !(flags & NPY_ITER_COMMON_DTYPE); op = NIT_OPERANDS(iter); op_dtype = NIT_DTYPES(iter); dtype = npyiter_get_common_dtype(nop, op, op_itflags, op_dtype, op_request_dtypes, only_inputs); if (dtype == NULL) { NpyIter_Deallocate(iter); return NULL; } if (flags & NPY_ITER_COMMON_DTYPE) { NPY_IT_DBG_PRINT("Iterator: Replacing all data types\n"); /* Replace all the data types */ for (iop = 0; iop < nop; ++iop) { if (op_dtype[iop] != dtype) { Py_XDECREF(op_dtype[iop]); Py_INCREF(dtype); op_dtype[iop] = dtype; } } } else { NPY_IT_DBG_PRINT("Iterator: Setting unset output data types\n"); /* Replace the NULL data types */ for (iop = 0; iop < nop; ++iop) { if (op_dtype[iop] == NULL) { Py_INCREF(dtype); op_dtype[iop] = dtype; } } } Py_DECREF(dtype); } NPY_IT_TIME_POINT(c_find_output_common_dtype); /* * All of the data types have been settled, so it's time * to check that data type conversions are following the * casting rules. */ if (!npyiter_check_casting(nop, op, op_dtype, casting, op_itflags)) { NpyIter_Deallocate(iter); return NULL; } NPY_IT_TIME_POINT(c_check_casting); /* * At this point, the iteration order has been finalized. so * any allocation of ops that were NULL, or any temporary * copying due to casting/byte order/alignment can be * done now using a memory layout matching the iterator. */ if (!npyiter_allocate_arrays(iter, flags, op_dtype, subtype, op_flags, op_itflags, op_axes)) { NpyIter_Deallocate(iter); return NULL; } NPY_IT_TIME_POINT(c_allocate_arrays); /* * Finally, if a multi-index wasn't requested, * it may be possible to coalesce some axes together. */ if (ndim > 1 && !(itflags & NPY_ITFLAG_HASMULTIINDEX)) { npyiter_coalesce_axes(iter); /* * The operation may have changed the layout, so we have to * get the internal pointers again. */ itflags = NIT_ITFLAGS(iter); ndim = NIT_NDIM(iter); op = NIT_OPERANDS(iter); op_dtype = NIT_DTYPES(iter); op_itflags = NIT_OPITFLAGS(iter); op_dataptr = NIT_RESETDATAPTR(iter); } NPY_IT_TIME_POINT(c_coalesce_axes); /* * Now that the axes are finished, check whether we can apply * the single iteration optimization to the iternext function. */ if (!(itflags & NPY_ITFLAG_BUFFER)) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); if (itflags & NPY_ITFLAG_EXLOOP) { if (NIT_ITERSIZE(iter) == NAD_SHAPE(axisdata)) { NIT_ITFLAGS(iter) |= NPY_ITFLAG_ONEITERATION; } } else if (NIT_ITERSIZE(iter) == 1) { NIT_ITFLAGS(iter) |= NPY_ITFLAG_ONEITERATION; } } /* * If REFS_OK was specified, check whether there are any * reference arrays and flag it if so. * * NOTE: This really should be unnecessary, but chances are someone relies * on it. The iterator itself does not require the API here * as it only does so for casting/buffering. But in almost all * use-cases the API will be required for whatever operation is done. 
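 *
 * The per-operand test below boils down to a dtype-flag check, roughly
 * (sketch; `descr` is one op_dtype entry):
 *
 *     if (PyDataType_FLAGCHK(descr, NPY_ITEM_REFCOUNT) ||
 *             PyDataType_FLAGCHK(descr, NPY_ITEM_IS_POINTER) ||
 *             PyDataType_FLAGCHK(descr, NPY_NEEDS_PYAPI)) {
 *         // this operand makes the iteration require the Python API (GIL)
 *     }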
*/ if (flags & NPY_ITER_REFS_OK) { for (iop = 0; iop < nop; ++iop) { PyArray_Descr *rdt = op_dtype[iop]; if ((rdt->flags & (NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER | NPY_NEEDS_PYAPI)) != 0) { /* Iteration needs API access */ NIT_ITFLAGS(iter) |= NPY_ITFLAG_NEEDSAPI; } } } /* If buffering is set without delayed allocation */ if (itflags & NPY_ITFLAG_BUFFER) { if (!npyiter_allocate_transfer_functions(iter)) { NpyIter_Deallocate(iter); return NULL; } if (!(itflags & NPY_ITFLAG_DELAYBUF)) { /* Allocate the buffers */ if (!npyiter_allocate_buffers(iter, NULL)) { NpyIter_Deallocate(iter); return NULL; } /* Prepare the next buffers and set iterend/size */ if (npyiter_copy_to_buffers(iter, NULL) < 0) { NpyIter_Deallocate(iter); return NULL; } } } NPY_IT_TIME_POINT(c_prepare_buffers); #if NPY_IT_CONSTRUCTION_TIMING printf("\nIterator construction timing:\n"); NPY_IT_PRINT_TIME_START(c_start); NPY_IT_PRINT_TIME_VAR(c_check_op_axes); NPY_IT_PRINT_TIME_VAR(c_check_global_flags); NPY_IT_PRINT_TIME_VAR(c_calculate_ndim); NPY_IT_PRINT_TIME_VAR(c_malloc); NPY_IT_PRINT_TIME_VAR(c_prepare_operands); NPY_IT_PRINT_TIME_VAR(c_fill_axisdata); NPY_IT_PRINT_TIME_VAR(c_compute_index_strides); NPY_IT_PRINT_TIME_VAR(c_apply_forced_iteration_order); NPY_IT_PRINT_TIME_VAR(c_find_best_axis_ordering); NPY_IT_PRINT_TIME_VAR(c_get_priority_subtype); NPY_IT_PRINT_TIME_VAR(c_find_output_common_dtype); NPY_IT_PRINT_TIME_VAR(c_check_casting); NPY_IT_PRINT_TIME_VAR(c_allocate_arrays); NPY_IT_PRINT_TIME_VAR(c_coalesce_axes); NPY_IT_PRINT_TIME_VAR(c_prepare_buffers); printf("\n"); #endif return iter; } /*NUMPY_API * Allocate a new iterator for more than one array object, using * standard NumPy broadcasting rules and the default buffer size. */ NPY_NO_EXPORT NpyIter * NpyIter_MultiNew(int nop, PyArrayObject **op_in, npy_uint32 flags, NPY_ORDER order, NPY_CASTING casting, npy_uint32 *op_flags, PyArray_Descr **op_request_dtypes) { return NpyIter_AdvancedNew(nop, op_in, flags, order, casting, op_flags, op_request_dtypes, -1, NULL, NULL, 0); } /*NUMPY_API * Allocate a new iterator for one array object. 
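 *
 * A minimal usage sketch (illustrative; `arr` is a hypothetical
 * PyArrayObject* and the iterator is known to be non-empty):
 *
 *     NpyIter *it = NpyIter_New(arr, NPY_ITER_READONLY,
 *                               NPY_KEEPORDER, NPY_NO_CASTING, NULL);
 *     if (it != NULL) {
 *         NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(it, NULL);
 *         char **dataptr = NpyIter_GetDataPtrArray(it);
 *         do {
 *             // dataptr[0] points at the current element
 *         } while (iternext(it));
 *         NpyIter_Deallocate(it);
 *     }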
*/
NPY_NO_EXPORT NpyIter *
NpyIter_New(PyArrayObject *op, npy_uint32 flags,
                  NPY_ORDER order, NPY_CASTING casting,
                  PyArray_Descr* dtype)
{
    /* Split the flags into separate global and op flags */
    npy_uint32 op_flags = flags & NPY_ITER_PER_OP_FLAGS;
    flags &= NPY_ITER_GLOBAL_FLAGS;

    return NpyIter_AdvancedNew(1, &op, flags, order, casting,
                                &op_flags, &dtype,
                                -1, NULL, NULL, 0);
}

/*NUMPY_API
 * Makes a copy of the iterator
 */
NPY_NO_EXPORT NpyIter *
NpyIter_Copy(NpyIter *iter)
{
    npy_uint32 itflags = NIT_ITFLAGS(iter);
    int ndim = NIT_NDIM(iter);
    int iop, nop = NIT_NOP(iter);
    int out_of_memory = 0;

    npy_intp size;
    NpyIter *newiter;
    PyArrayObject **objects;
    PyArray_Descr **dtypes;

    /* Allocate memory for the new iterator */
    size = NIT_SIZEOF_ITERATOR(itflags, ndim, nop);
    newiter = (NpyIter*)PyObject_Malloc(size);
    if (newiter == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    /* Copy the raw values to the new iterator */
    memcpy(newiter, iter, size);

    /* Take ownership of references to the operands and dtypes */
    objects = NIT_OPERANDS(newiter);
    dtypes = NIT_DTYPES(newiter);
    for (iop = 0; iop < nop; ++iop) {
        Py_INCREF(objects[iop]);
        Py_INCREF(dtypes[iop]);
    }

    /* Allocate buffers and make copies of the transfer data if necessary */
    if (itflags & NPY_ITFLAG_BUFFER) {
        NpyIter_BufferData *bufferdata;
        npy_intp buffersize, itemsize;
        char **buffers;

        bufferdata = NIT_BUFFERDATA(newiter);
        buffers = NBF_BUFFERS(bufferdata);
        buffersize = NBF_BUFFERSIZE(bufferdata);
        NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata);

        for (iop = 0; iop < nop; ++iop) {
            if (buffers[iop] != NULL) {
                if (out_of_memory) {
                    buffers[iop] = NULL;
                }
                else {
                    itemsize = dtypes[iop]->elsize;
                    buffers[iop] = PyArray_malloc(itemsize*buffersize);
                    if (buffers[iop] == NULL) {
                        out_of_memory = 1;
                    }
                    else {
                        if (PyDataType_FLAGCHK(dtypes[iop], NPY_NEEDS_INIT)) {
                            memset(buffers[iop], '\0', itemsize*buffersize);
                        }
                    }
                }
            }

            if (transferinfo[iop].read.func != NULL) {
                if (out_of_memory) {
                    transferinfo[iop].read.func = NULL;  /* No cleanup */
                }
                else {
                    if (NPY_cast_info_copy(&transferinfo[iop].read,
                                           &transferinfo[iop].read) < 0) {
                        out_of_memory = 1;
                    }
                }
            }
            if (transferinfo[iop].write.func != NULL) {
                if (out_of_memory) {
                    transferinfo[iop].write.func = NULL;  /* No cleanup */
                }
                else {
                    if (NPY_cast_info_copy(&transferinfo[iop].write,
                                           &transferinfo[iop].write) < 0) {
                        out_of_memory = 1;
                    }
                }
            }
            if (transferinfo[iop].clear.func != NULL) {
                if (out_of_memory) {
                    transferinfo[iop].clear.func = NULL;  /* No cleanup */
                }
                else {
                    if (NPY_traverse_info_copy(&transferinfo[iop].clear,
                                               &transferinfo[iop].clear) < 0) {
                        out_of_memory = 1;
                    }
                }
            }
        }

        /* Initialize the buffers to the current iterindex */
        if (!out_of_memory && NBF_SIZE(bufferdata) > 0) {
            npyiter_goto_iterindex(newiter, NIT_ITERINDEX(newiter));

            /* Prepare the next buffers and set iterend/size */
            npyiter_copy_to_buffers(newiter, NULL);
        }
    }

    if (out_of_memory) {
        NpyIter_Deallocate(newiter);
        PyErr_NoMemory();
        return NULL;
    }

    return newiter;
}

/*NUMPY_API
 * Deallocate an iterator.
 *
 * To correctly work when an error is in progress, we have to check
 * `PyErr_Occurred()`. This is necessary when buffers are not finalized
 * or WritebackIfCopy is used. We could avoid that check by exposing a new
 * function which is passed in whether or not a Python error is already set.
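 *
 * Caller-side sketch: even on the success path the return value matters,
 * because resolving a WRITEBACKIFCOPY operand can itself fail
 * (`process(it)` is a hypothetical helper):
 *
 *     if (process(it) < 0) {
 *         NpyIter_Deallocate(it);     // error already set; best-effort cleanup
 *         return NULL;
 *     }
 *     if (!NpyIter_Deallocate(it)) {  // may fail during writeback resolution
 *         return NULL;
 *     }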
*/ NPY_NO_EXPORT int NpyIter_Deallocate(NpyIter *iter) { int success = PyErr_Occurred() == NULL; npy_uint32 itflags; /*int ndim = NIT_NDIM(iter);*/ int iop, nop; PyArray_Descr **dtype; PyArrayObject **object; npyiter_opitflags *op_itflags; if (iter == NULL) { return success; } itflags = NIT_ITFLAGS(iter); nop = NIT_NOP(iter); dtype = NIT_DTYPES(iter); object = NIT_OPERANDS(iter); op_itflags = NIT_OPITFLAGS(iter); /* Deallocate any buffers and buffering data */ if (itflags & NPY_ITFLAG_BUFFER) { /* Ensure no data is held by the buffers before they are cleared */ if (success) { if (npyiter_copy_from_buffers(iter) < 0) { success = NPY_FAIL; } } else { npyiter_clear_buffers(iter); } NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); char **buffers; /* buffers */ buffers = NBF_BUFFERS(bufferdata); for (iop = 0; iop < nop; ++iop, ++buffers) { PyArray_free(*buffers); } NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); /* read bufferdata */ for (iop = 0; iop < nop; ++iop, ++transferinfo) { NPY_cast_info_xfree(&transferinfo->read); NPY_cast_info_xfree(&transferinfo->write); NPY_traverse_info_xfree(&transferinfo->clear); } } /* * Deallocate all the dtypes and objects that were iterated and resolve * any writeback buffers created by the iterator. */ for (iop = 0; iop < nop; ++iop, ++dtype, ++object) { if (op_itflags[iop] & NPY_OP_ITFLAG_HAS_WRITEBACK) { if (success && PyArray_ResolveWritebackIfCopy(*object) < 0) { success = 0; } else { PyArray_DiscardWritebackIfCopy(*object); } } Py_XDECREF(*dtype); Py_XDECREF(*object); } /* Deallocate the iterator memory */ PyObject_Free(iter); return success; } /* Checks 'flags' for (C|F)_ORDER_INDEX, MULTI_INDEX, and EXTERNAL_LOOP, * setting the appropriate internal flags in 'itflags'. * * Returns 1 on success, 0 on error. */ static int npyiter_check_global_flags(npy_uint32 flags, npy_uint32* itflags) { if ((flags & NPY_ITER_PER_OP_FLAGS) != 0) { PyErr_SetString(PyExc_ValueError, "A per-operand flag was passed as a global flag " "to the iterator constructor"); return 0; } /* Check for an index */ if (flags & (NPY_ITER_C_INDEX | NPY_ITER_F_INDEX)) { if ((flags & (NPY_ITER_C_INDEX | NPY_ITER_F_INDEX)) == (NPY_ITER_C_INDEX | NPY_ITER_F_INDEX)) { PyErr_SetString(PyExc_ValueError, "Iterator flags C_INDEX and " "F_INDEX cannot both be specified"); return 0; } (*itflags) |= NPY_ITFLAG_HASINDEX; } /* Check if a multi-index was requested */ if (flags & NPY_ITER_MULTI_INDEX) { /* * This flag primarily disables dimension manipulations that * would produce an incorrect multi-index. 
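 *
 * A caller that tracks the multi-index typically looks like this
 * (sketch; `it` is a hypothetical iterator created with
 * NPY_ITER_MULTI_INDEX):
 *
 *     NpyIter_GetMultiIndexFunc *get_multi_index =
 *                                     NpyIter_GetGetMultiIndex(it, NULL);
 *     NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(it, NULL);
 *     npy_intp multi_index[NPY_MAXDIMS];
 *     do {
 *         get_multi_index(it, multi_index);  // current coordinates
 *     } while (iternext(it));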
*/ (*itflags) |= NPY_ITFLAG_HASMULTIINDEX; } /* Check if the caller wants to handle inner iteration */ if (flags & NPY_ITER_EXTERNAL_LOOP) { if ((*itflags) & (NPY_ITFLAG_HASINDEX | NPY_ITFLAG_HASMULTIINDEX)) { PyErr_SetString(PyExc_ValueError, "Iterator flag EXTERNAL_LOOP cannot be used " "if an index or multi-index is being tracked"); return 0; } (*itflags) |= NPY_ITFLAG_EXLOOP; } /* Ranged */ if (flags & NPY_ITER_RANGED) { (*itflags) |= NPY_ITFLAG_RANGE; if ((flags & NPY_ITER_EXTERNAL_LOOP) && !(flags & NPY_ITER_BUFFERED)) { PyErr_SetString(PyExc_ValueError, "Iterator flag RANGED cannot be used with " "the flag EXTERNAL_LOOP unless " "BUFFERED is also enabled"); return 0; } } /* Buffering */ if (flags & NPY_ITER_BUFFERED) { (*itflags) |= NPY_ITFLAG_BUFFER; if (flags & NPY_ITER_GROWINNER) { (*itflags) |= NPY_ITFLAG_GROWINNER; } if (flags & NPY_ITER_DELAY_BUFALLOC) { (*itflags) |= NPY_ITFLAG_DELAYBUF; } } return 1; } static int npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes, const npy_intp *itershape) { char axes_dupcheck[NPY_MAXDIMS]; int iop, idim; if (oa_ndim < 0) { /* * If `oa_ndim < 0`, `op_axes` and `itershape` are signalled to * be unused and should be NULL. (Before NumPy 1.8 this was * signalled by `oa_ndim == 0`.) */ if (op_axes != NULL || itershape != NULL) { PyErr_Format(PyExc_ValueError, "If 'op_axes' or 'itershape' is not NULL in the iterator " "constructor, 'oa_ndim' must be zero or greater"); return 0; } return 1; } if (oa_ndim > NPY_MAXDIMS) { PyErr_Format(PyExc_ValueError, "Cannot construct an iterator with more than %d dimensions " "(%d were requested for op_axes)", NPY_MAXDIMS, oa_ndim); return 0; } if (op_axes == NULL) { PyErr_Format(PyExc_ValueError, "If 'oa_ndim' is zero or greater in the iterator " "constructor, then op_axes cannot be NULL"); return 0; } /* Check that there are no duplicates in op_axes */ for (iop = 0; iop < nop; ++iop) { int *axes = op_axes[iop]; if (axes != NULL) { memset(axes_dupcheck, 0, NPY_MAXDIMS); for (idim = 0; idim < oa_ndim; ++idim) { int i = npyiter_get_op_axis(axes[idim], NULL); if (i >= 0) { if (i >= NPY_MAXDIMS) { PyErr_Format(PyExc_ValueError, "The 'op_axes' provided to the iterator " "constructor for operand %d " "contained invalid " "values %d", iop, i); return 0; } else if (axes_dupcheck[i] == 1) { PyErr_Format(PyExc_ValueError, "The 'op_axes' provided to the iterator " "constructor for operand %d " "contained duplicate " "value %d", iop, i); return 0; } else { axes_dupcheck[i] = 1; } } } } } return 1; } static int npyiter_calculate_ndim(int nop, PyArrayObject **op_in, int oa_ndim) { /* If 'op_axes' is being used, force 'ndim' */ if (oa_ndim >= 0 ) { return oa_ndim; } /* Otherwise it's the maximum 'ndim' from the operands */ else { int ndim = 0, iop; for (iop = 0; iop < nop; ++iop) { if (op_in[iop] != NULL) { int ondim = PyArray_NDIM(op_in[iop]); if (ondim > ndim) { ndim = ondim; } } } return ndim; } } /* * Checks the per-operand input flags, and fills in op_itflags. * * Returns 1 on success, 0 on failure. 
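 *
 * Typical flag combinations a caller passes per operand (sketch):
 *
 *     npy_uint32 op_flags[3];
 *     op_flags[0] = NPY_ITER_READONLY;                          // plain input
 *     op_flags[1] = NPY_ITER_READWRITE | NPY_ITER_UPDATEIFCOPY; // may be
 *                                                               // copied back
 *     op_flags[2] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE;     // NULL output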
*/ static int npyiter_check_per_op_flags(npy_uint32 op_flags, npyiter_opitflags *op_itflags) { if ((op_flags & NPY_ITER_GLOBAL_FLAGS) != 0) { PyErr_SetString(PyExc_ValueError, "A global iterator flag was passed as a per-operand flag " "to the iterator constructor"); return 0; } /* Check the read/write flags */ if (op_flags & NPY_ITER_READONLY) { /* The read/write flags are mutually exclusive */ if (op_flags & (NPY_ITER_READWRITE|NPY_ITER_WRITEONLY)) { PyErr_SetString(PyExc_ValueError, "Only one of the iterator flags READWRITE, " "READONLY, and WRITEONLY may be " "specified for an operand"); return 0; } *op_itflags = NPY_OP_ITFLAG_READ; } else if (op_flags & NPY_ITER_READWRITE) { /* The read/write flags are mutually exclusive */ if (op_flags & NPY_ITER_WRITEONLY) { PyErr_SetString(PyExc_ValueError, "Only one of the iterator flags READWRITE, " "READONLY, and WRITEONLY may be " "specified for an operand"); return 0; } *op_itflags = NPY_OP_ITFLAG_READ|NPY_OP_ITFLAG_WRITE; } else if(op_flags & NPY_ITER_WRITEONLY) { *op_itflags = NPY_OP_ITFLAG_WRITE; } else { PyErr_SetString(PyExc_ValueError, "None of the iterator flags READWRITE, " "READONLY, or WRITEONLY were " "specified for an operand"); return 0; } /* Check the flags for temporary copies */ if (((*op_itflags) & NPY_OP_ITFLAG_WRITE) && (op_flags & (NPY_ITER_COPY | NPY_ITER_UPDATEIFCOPY)) == NPY_ITER_COPY) { PyErr_SetString(PyExc_ValueError, "If an iterator operand is writeable, must use " "the flag UPDATEIFCOPY instead of " "COPY"); return 0; } /* Check the flag for a write masked operands */ if (op_flags & NPY_ITER_WRITEMASKED) { if (!((*op_itflags) & NPY_OP_ITFLAG_WRITE)) { PyErr_SetString(PyExc_ValueError, "The iterator flag WRITEMASKED may only " "be used with READWRITE or WRITEONLY"); return 0; } if ((op_flags & NPY_ITER_ARRAYMASK) != 0) { PyErr_SetString(PyExc_ValueError, "The iterator flag WRITEMASKED may not " "be used together with ARRAYMASK"); return 0; } *op_itflags |= NPY_OP_ITFLAG_WRITEMASKED; } if ((op_flags & NPY_ITER_VIRTUAL) != 0) { if ((op_flags & NPY_ITER_READWRITE) == 0) { PyErr_SetString(PyExc_ValueError, "The iterator flag VIRTUAL should be " "be used together with READWRITE"); return 0; } *op_itflags |= NPY_OP_ITFLAG_VIRTUAL; } return 1; } /* * Prepares a constructor operand. Assumes a reference to 'op' * is owned, and that 'op' may be replaced. Fills in 'op_dataptr', * 'op_dtype', and may modify 'op_itflags'. * * Returns 1 on success, 0 on failure. */ static int npyiter_prepare_one_operand(PyArrayObject **op, char **op_dataptr, PyArray_Descr *op_request_dtype, PyArray_Descr **op_dtype, npy_uint32 flags, npy_uint32 op_flags, npyiter_opitflags *op_itflags) { /* NULL operands must be automatically allocated outputs */ if (*op == NULL) { /* ALLOCATE or VIRTUAL should be enabled */ if ((op_flags & (NPY_ITER_ALLOCATE|NPY_ITER_VIRTUAL)) == 0) { PyErr_SetString(PyExc_ValueError, "Iterator operand was NULL, but neither the " "ALLOCATE nor the VIRTUAL flag was specified"); return 0; } if (op_flags & NPY_ITER_ALLOCATE) { /* Writing should be enabled */ if (!((*op_itflags) & NPY_OP_ITFLAG_WRITE)) { PyErr_SetString(PyExc_ValueError, "Automatic allocation was requested for an iterator " "operand, but it wasn't flagged for writing"); return 0; } /* * Reading should be disabled if buffering is enabled without * also enabling NPY_ITER_DELAY_BUFALLOC. In all other cases, * the caller may initialize the allocated operand to a value * before beginning iteration. 
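 *
 * The supported way to pre-fill an allocated, readable operand under
 * buffering is DELAY_BUFALLOC followed by a reset (sketch; `iop` is the
 * hypothetical index of the ALLOCATE|READWRITE operand):
 *
 *     // iterator flags include NPY_ITER_BUFFERED | NPY_ITER_DELAY_BUFALLOC
 *     PyArrayObject *out = NpyIter_GetOperandArray(it)[iop];
 *     // ... fill `out` with initial values ...
 *     if (NpyIter_Reset(it, NULL) != NPY_SUCCEED) {
 *         // handle error
 *     }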
*/ if (((flags & (NPY_ITER_BUFFERED | NPY_ITER_DELAY_BUFALLOC)) == NPY_ITER_BUFFERED) && ((*op_itflags) & NPY_OP_ITFLAG_READ)) { PyErr_SetString(PyExc_ValueError, "Automatic allocation was requested for an iterator " "operand, and it was flagged as readable, but " "buffering without delayed allocation was enabled"); return 0; } /* If a requested dtype was provided, use it, otherwise NULL */ Py_XINCREF(op_request_dtype); *op_dtype = op_request_dtype; } else { *op_dtype = NULL; } /* Specify bool if no dtype was requested for the mask */ if (op_flags & NPY_ITER_ARRAYMASK) { if (*op_dtype == NULL) { *op_dtype = PyArray_DescrFromType(NPY_BOOL); if (*op_dtype == NULL) { return 0; } } } *op_dataptr = NULL; return 1; } /* VIRTUAL operands must be NULL */ if (op_flags & NPY_ITER_VIRTUAL) { PyErr_SetString(PyExc_ValueError, "Iterator operand flag VIRTUAL was specified, " "but the operand was not NULL"); return 0; } if (PyArray_Check(*op)) { if ((*op_itflags) & NPY_OP_ITFLAG_WRITE && PyArray_FailUnlessWriteable(*op, "operand array with iterator " "write flag set") < 0) { return 0; } if (!(flags & NPY_ITER_ZEROSIZE_OK) && PyArray_SIZE(*op) == 0) { PyErr_SetString(PyExc_ValueError, "Iteration of zero-sized operands is not enabled"); return 0; } *op_dataptr = PyArray_BYTES(*op); /* PyArray_DESCR does not give us a reference */ *op_dtype = PyArray_DESCR(*op); if (*op_dtype == NULL) { PyErr_SetString(PyExc_ValueError, "Iterator input operand has no dtype descr"); return 0; } Py_INCREF(*op_dtype); /* * If references weren't specifically allowed, make sure there * are no references in the inputs or requested dtypes. */ if (!(flags & NPY_ITER_REFS_OK)) { PyArray_Descr *dt = PyArray_DESCR(*op); if (((dt->flags & (NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER)) != 0) || (dt != *op_dtype && (((*op_dtype)->flags & (NPY_ITEM_REFCOUNT | NPY_ITEM_IS_POINTER))) != 0)) { PyErr_SetString(PyExc_TypeError, "Iterator operand or requested dtype holds " "references, but the NPY_ITER_REFS_OK flag was not " "enabled"); return 0; } } /* * Checking whether casts are valid is done later, once the * final data types have been selected. For now, just store the * requested type. */ if (op_request_dtype != NULL) { /* We just have a borrowed reference to op_request_dtype */ Py_SETREF(*op_dtype, PyArray_AdaptDescriptorToArray( *op, NULL, op_request_dtype)); if (*op_dtype == NULL) { return 0; } } /* Check if the operand is in the byte order requested */ if (op_flags & NPY_ITER_NBO) { /* Check byte order */ if (!PyArray_ISNBO((*op_dtype)->byteorder)) { /* Replace with a new descr which is in native byte order */ Py_SETREF(*op_dtype, PyArray_DescrNewByteorder(*op_dtype, NPY_NATIVE)); if (*op_dtype == NULL) { return 0; } NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because of NPY_ITER_NBO\n"); /* Indicate that byte order or alignment needs fixing */ *op_itflags |= NPY_OP_ITFLAG_CAST; } } /* Check if the operand is aligned */ if (op_flags & NPY_ITER_ALIGNED) { /* Check alignment */ if (!IsAligned(*op)) { NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because of NPY_ITER_ALIGNED\n"); *op_itflags |= NPY_OP_ITFLAG_CAST; } } /* * The check for NPY_ITER_CONTIG can only be done later, * once the final iteration order is settled. */ } else { PyErr_SetString(PyExc_ValueError, "Iterator inputs must be ndarrays"); return 0; } return 1; } /* * Process all the operands, copying new references so further processing * can replace the arrays if copying is necessary. 
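 *
 * The ownership pattern used throughout, in miniature (sketch;
 * `copied_op` is a hypothetical replacement array):
 *
 *     op[iop] = op_in[iop];
 *     Py_XINCREF(op[iop]);            // the iterator owns its own reference
 *     // ... later, if a temporary copy is needed ...
 *     Py_SETREF(op[iop], copied_op);  // replaces and releases the original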
*/ static int npyiter_prepare_operands(int nop, PyArrayObject **op_in, PyArrayObject **op, char **op_dataptr, PyArray_Descr **op_request_dtypes, PyArray_Descr **op_dtype, npy_uint32 flags, npy_uint32 *op_flags, npyiter_opitflags *op_itflags, npy_int8 *out_maskop) { int iop, i; npy_int8 maskop = -1; int any_writemasked_ops = 0; /* * Here we just prepare the provided operands. */ for (iop = 0; iop < nop; ++iop) { op[iop] = op_in[iop]; Py_XINCREF(op[iop]); op_dtype[iop] = NULL; /* Check the readonly/writeonly flags, and fill in op_itflags */ if (!npyiter_check_per_op_flags(op_flags[iop], &op_itflags[iop])) { goto fail_iop; } /* Extract the operand which is for masked iteration */ if ((op_flags[iop] & NPY_ITER_ARRAYMASK) != 0) { if (maskop != -1) { PyErr_SetString(PyExc_ValueError, "Only one iterator operand may receive an " "ARRAYMASK flag"); goto fail_iop; } maskop = iop; *out_maskop = iop; } if (op_flags[iop] & NPY_ITER_WRITEMASKED) { any_writemasked_ops = 1; } /* * Prepare the operand. This produces an op_dtype[iop] reference * on success. */ if (!npyiter_prepare_one_operand(&op[iop], &op_dataptr[iop], op_request_dtypes ? op_request_dtypes[iop] : NULL, &op_dtype[iop], flags, op_flags[iop], &op_itflags[iop])) { goto fail_iop; } } if (any_writemasked_ops && maskop < 0) { PyErr_SetString(PyExc_ValueError, "An iterator operand was flagged as WRITEMASKED, " "but no ARRAYMASK operand was given to supply " "the mask"); goto fail_nop; } else if (!any_writemasked_ops && maskop >= 0) { PyErr_SetString(PyExc_ValueError, "An iterator operand was flagged as the ARRAYMASK, " "but no WRITEMASKED operands were given to use " "the mask"); goto fail_nop; } return 1; fail_nop: iop = nop - 1; fail_iop: for (i = 0; i < iop+1; ++i) { Py_XDECREF(op[i]); Py_XDECREF(op_dtype[i]); } return 0; } static const char * npyiter_casting_to_string(NPY_CASTING casting) { switch (casting) { case NPY_NO_CASTING: return "'no'"; case NPY_EQUIV_CASTING: return "'equiv'"; case NPY_SAFE_CASTING: return "'safe'"; case NPY_SAME_KIND_CASTING: return "'same_kind'"; case NPY_UNSAFE_CASTING: return "'unsafe'"; default: return ""; } } static int npyiter_check_casting(int nop, PyArrayObject **op, PyArray_Descr **op_dtype, NPY_CASTING casting, npyiter_opitflags *op_itflags) { int iop; for(iop = 0; iop < nop; ++iop) { NPY_IT_DBG_PRINT1("Iterator: Checking casting for operand %d\n", (int)iop); #if NPY_IT_DBG_TRACING printf("op: "); if (op[iop] != NULL) { PyObject_Print((PyObject *)PyArray_DESCR(op[iop]), stdout, 0); } else { printf(""); } printf(", iter: "); PyObject_Print((PyObject *)op_dtype[iop], stdout, 0); printf("\n"); #endif /* If the types aren't equivalent, a cast is necessary */ if (op[iop] != NULL && !PyArray_EquivTypes(PyArray_DESCR(op[iop]), op_dtype[iop])) { /* Check read (op -> temp) casting */ if ((op_itflags[iop] & NPY_OP_ITFLAG_READ) && !PyArray_CanCastArrayTo(op[iop], op_dtype[iop], casting)) { PyErr_Format(PyExc_TypeError, "Iterator operand %d dtype could not be cast from " "%R to %R according to the rule %s", iop, PyArray_DESCR(op[iop]), op_dtype[iop], npyiter_casting_to_string(casting)); return 0; } /* Check write (temp -> op) casting */ if ((op_itflags[iop] & NPY_OP_ITFLAG_WRITE) && !PyArray_CanCastTypeTo(op_dtype[iop], PyArray_DESCR(op[iop]), casting)) { PyErr_Format(PyExc_TypeError, "Iterator requested dtype could not be cast from " "%R to %R, the operand %d dtype, " "according to the rule %s", op_dtype[iop], PyArray_DESCR(op[iop]), iop, npyiter_casting_to_string(casting)); return 0; } NPY_IT_DBG_PRINT("Iterator: 
Setting NPY_OP_ITFLAG_CAST " "because the types aren't equivalent\n"); /* Indicate that this operand needs casting */ op_itflags[iop] |= NPY_OP_ITFLAG_CAST; } } return 1; } /* * Checks that the mask broadcasts to the WRITEMASK REDUCE * operand 'iop', but 'iop' never broadcasts to the mask. * If 'iop' broadcasts to the mask, the result would be more * than one mask value per reduction element, something which * is invalid. * * This check should only be called after all the operands * have been filled in. * * Returns 1 on success, 0 on error. */ static int check_mask_for_writemasked_reduction(NpyIter *iter, int iop) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int nop = NIT_NOP(iter); int maskop = NIT_MASKOP(iter); NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; axisdata = NIT_AXISDATA(iter); sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); for(idim = 0; idim < ndim; ++idim) { npy_intp maskstride, istride; istride = NAD_STRIDES(axisdata)[iop]; maskstride = NAD_STRIDES(axisdata)[maskop]; /* * If 'iop' is being broadcast to 'maskop', we have * the invalid situation described above. */ if (maskstride != 0 && istride == 0) { PyErr_SetString(PyExc_ValueError, "Iterator reduction operand is WRITEMASKED, " "but also broadcasts to multiple mask values. " "There can be only one mask value per WRITEMASKED " "element."); return 0; } NIT_ADVANCE_AXISDATA(axisdata, 1); } return 1; } /* * Check whether a reduction is OK based on the flags and the operand being * readwrite. This path is deprecated, since usually only specific axes * should be reduced. If axes are specified explicitly, the flag is * unnecessary. */ static int npyiter_check_reduce_ok_and_set_flags( NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itflags, int iop, int maskop, int dim) { /* If it's writeable, this means a reduction */ if (op_itflags[iop] & NPY_OP_ITFLAG_WRITE) { if (!(flags & NPY_ITER_REDUCE_OK)) { PyErr_Format(PyExc_ValueError, "output operand requires a reduction along dimension %d, " "but the reduction is not enabled. The dimension size of 1 " "does not match the expected output shape.", dim); return 0; } if (!(op_itflags[iop] & NPY_OP_ITFLAG_READ)) { PyErr_SetString(PyExc_ValueError, "output operand requires a reduction, but is flagged as " "write-only, not read-write"); return 0; } /* * The ARRAYMASK can't be a reduction, because * it would be possible to write back to the * array once when the ARRAYMASK says 'True', * then have the reduction on the ARRAYMASK * later flip to 'False', indicating that the * write back should never have been done, * and violating the strict masking semantics */ if (iop == maskop) { PyErr_SetString(PyExc_ValueError, "output operand requires a " "reduction, but is flagged as " "the ARRAYMASK operand which " "is not permitted to be the " "result of a reduction"); return 0; } NPY_IT_DBG_PRINT("Iterator: Indicating that a reduction is" "occurring\n"); NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE; } return 1; } /** * Removes the (additive) NPY_ITER_REDUCTION_AXIS indication and sets * is_forced_broadcast to 1 if it is set. Otherwise to 0. * * @param axis The op_axes[i] to normalize. * @param reduction_axis Output 1 if a reduction axis, otherwise 0. * @returns The normalized axis (without reduce axis flag). 
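 *
 * Example (sketch): decoding an entry that was marked as a forced
 * reduction axis versus a plain one:
 *
 *     npy_bool is_reduction;
 *     int axis = npyiter_get_op_axis(NPY_ITER_REDUCTION_AXIS(2),
 *                                    &is_reduction);
 *     // axis == 2, is_reduction == 1
 *     axis = npyiter_get_op_axis(2, &is_reduction);
 *     // axis == 2, is_reduction == 0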
*/ static inline int npyiter_get_op_axis(int axis, npy_bool *reduction_axis) { npy_bool forced_broadcast = axis >= NPY_ITER_REDUCTION_AXIS(-1); if (reduction_axis != NULL) { *reduction_axis = forced_broadcast; } if (forced_broadcast) { return axis - NPY_ITER_REDUCTION_AXIS(0); } return axis; } /* * Fills in the AXISDATA for the 'nop' operands, broadcasting * the dimensionas as necessary. Also fills * in the ITERSIZE data member. * * If op_axes is not NULL, it should point to an array of ndim-sized * arrays, one for each op. * * Returns 1 on success, 0 on failure. */ static int npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itflags, char **op_dataptr, const npy_uint32 *op_flags, int **op_axes, npy_intp const *itershape) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int iop, nop = NIT_NOP(iter); int maskop = NIT_MASKOP(iter); int ondim; NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; PyArrayObject **op = NIT_OPERANDS(iter), *op_cur; npy_intp broadcast_shape[NPY_MAXDIMS]; /* First broadcast the shapes together */ if (itershape == NULL) { for (idim = 0; idim < ndim; ++idim) { broadcast_shape[idim] = 1; } } else { for (idim = 0; idim < ndim; ++idim) { broadcast_shape[idim] = itershape[idim]; /* Negative shape entries are deduced from the operands */ if (broadcast_shape[idim] < 0) { broadcast_shape[idim] = 1; } } } for (iop = 0; iop < nop; ++iop) { op_cur = op[iop]; if (op_cur != NULL) { npy_intp *shape = PyArray_DIMS(op_cur); ondim = PyArray_NDIM(op_cur); if (op_axes == NULL || op_axes[iop] == NULL) { /* * Possible if op_axes are being used, but * op_axes[iop] is NULL */ if (ondim > ndim) { PyErr_SetString(PyExc_ValueError, "input operand has more dimensions than allowed " "by the axis remapping"); return 0; } for (idim = 0; idim < ondim; ++idim) { npy_intp bshape = broadcast_shape[idim+ndim-ondim]; npy_intp op_shape = shape[idim]; if (bshape == 1) { broadcast_shape[idim+ndim-ondim] = op_shape; } else if (bshape != op_shape && op_shape != 1) { goto broadcast_error; } } } else { int *axes = op_axes[iop]; for (idim = 0; idim < ndim; ++idim) { int i = npyiter_get_op_axis(axes[idim], NULL); if (i >= 0) { if (i < ondim) { npy_intp bshape = broadcast_shape[idim]; npy_intp op_shape = shape[i]; if (bshape == 1) { broadcast_shape[idim] = op_shape; } else if (bshape != op_shape && op_shape != 1) { goto broadcast_error; } } else { PyErr_Format(PyExc_ValueError, "Iterator input op_axes[%d][%d] (==%d) " "is not a valid axis of op[%d], which " "has %d dimensions ", iop, (ndim-idim-1), i, iop, ondim); return 0; } } } } } } /* * If a shape was provided with a 1 entry, make sure that entry didn't * get expanded by broadcasting. 
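 *
 * Example: operands with shapes (3,1) and (1,4) broadcast to (3,4).  If
 * the caller passed itershape == {3,1}, the second entry was requested as
 * 1 but got expanded to 4, so the check below raises an error; an
 * itershape of {3,-1} or {3,4} would be accepted.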
*/ if (itershape != NULL) { for (idim = 0; idim < ndim; ++idim) { if (itershape[idim] == 1 && broadcast_shape[idim] != 1) { goto broadcast_error; } } } axisdata = NIT_AXISDATA(iter); sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); if (ndim == 0) { /* Need to fill the first axisdata, even if the iterator is 0-d */ NAD_SHAPE(axisdata) = 1; NAD_INDEX(axisdata) = 0; memcpy(NAD_PTRS(axisdata), op_dataptr, NPY_SIZEOF_INTP*nop); memset(NAD_STRIDES(axisdata), 0, NPY_SIZEOF_INTP*nop); } /* Now process the operands, filling in the axisdata */ for (idim = 0; idim < ndim; ++idim) { npy_intp bshape = broadcast_shape[ndim-idim-1]; npy_intp *strides = NAD_STRIDES(axisdata); NAD_SHAPE(axisdata) = bshape; NAD_INDEX(axisdata) = 0; memcpy(NAD_PTRS(axisdata), op_dataptr, NPY_SIZEOF_INTP*nop); for (iop = 0; iop < nop; ++iop) { op_cur = op[iop]; if (op_axes == NULL || op_axes[iop] == NULL) { if (op_cur == NULL) { strides[iop] = 0; } else { ondim = PyArray_NDIM(op_cur); if (bshape == 1) { strides[iop] = 0; if (idim >= ondim && (op_flags[iop] & NPY_ITER_NO_BROADCAST)) { goto operand_different_than_broadcast; } } else if (idim >= ondim || PyArray_DIM(op_cur, ondim-idim-1) == 1) { strides[iop] = 0; if (op_flags[iop] & NPY_ITER_NO_BROADCAST) { goto operand_different_than_broadcast; } /* If it's writeable, this means a reduction */ if (!npyiter_check_reduce_ok_and_set_flags( iter, flags, op_itflags, iop, maskop, idim)) { return 0; } } else { strides[iop] = PyArray_STRIDE(op_cur, ondim-idim-1); } } } else { int *axes = op_axes[iop]; npy_bool reduction_axis; int i; i = npyiter_get_op_axis(axes[ndim - idim - 1], &reduction_axis); if (reduction_axis) { /* This is explicitly a reduction axis */ strides[iop] = 0; NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE; op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE; if (NPY_UNLIKELY((i >= 0) && (op_cur != NULL) && (PyArray_DIM(op_cur, i) != 1))) { PyErr_Format(PyExc_ValueError, "operand was set up as a reduction along axis " "%d, but the length of the axis is %zd " "(it has to be 1)", i, (Py_ssize_t)PyArray_DIM(op_cur, i)); return 0; } } else if (bshape == 1) { /* * If the full iterator shape is 1, zero always works. * NOTE: We thus always allow broadcast dimensions (i = -1) * if the shape is 1. */ strides[iop] = 0; } else if (i >= 0) { if (op_cur == NULL) { /* stride is filled later, shape will match `bshape` */ strides[iop] = 0; } else if (PyArray_DIM(op_cur, i) == 1) { strides[iop] = 0; if (op_flags[iop] & NPY_ITER_NO_BROADCAST) { goto operand_different_than_broadcast; } if (!npyiter_check_reduce_ok_and_set_flags( iter, flags, op_itflags, iop, maskop, i)) { return 0; } } else { strides[iop] = PyArray_STRIDE(op_cur, i); } } else { strides[iop] = 0; /* * If deleting this axis produces a reduction, but * reduction wasn't enabled, throw an error. * NOTE: We currently always allow new-axis if the iteration * size is 1 (thus allowing broadcasting sometimes). */ if (!npyiter_check_reduce_ok_and_set_flags( iter, flags, op_itflags, iop, maskop, i)) { return 0; } } } } NIT_ADVANCE_AXISDATA(axisdata, 1); } /* Now fill in the ITERSIZE member */ NIT_ITERSIZE(iter) = 1; for (idim = 0; idim < ndim; ++idim) { if (npy_mul_sizes_with_overflow(&NIT_ITERSIZE(iter), NIT_ITERSIZE(iter), broadcast_shape[idim])) { if ((itflags & NPY_ITFLAG_HASMULTIINDEX) && !(itflags & NPY_ITFLAG_HASINDEX) && !(itflags & NPY_ITFLAG_BUFFER)) { /* * If RemoveAxis may be called, the size check is delayed * until either the multi index is removed, or GetIterNext * is called. 
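 *
 * Example of the deferral: with shapes like (2**40, 2**40) the product
 * overflows npy_intp and ITERSIZE is set to -1; if the caller later drops
 * an axis with NpyIter_RemoveAxis() the remaining size may fit, so the
 * "iterator is too large" failure is only raised once the size is really
 * needed.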
*/ NIT_ITERSIZE(iter) = -1; break; } else { PyErr_SetString(PyExc_ValueError, "iterator is too large"); return 0; } } } /* The range defaults to everything */ NIT_ITERSTART(iter) = 0; NIT_ITEREND(iter) = NIT_ITERSIZE(iter); return 1; broadcast_error: { npy_intp remdims[NPY_MAXDIMS]; if (op_axes == NULL) { PyObject *shape1 = PyUnicode_FromString(""); if (shape1 == NULL) { return 0; } for (iop = 0; iop < nop; ++iop) { if (op[iop] != NULL) { int ndims = PyArray_NDIM(op[iop]); npy_intp *dims = PyArray_DIMS(op[iop]); PyObject *tmp = convert_shape_to_string(ndims, dims, " "); if (tmp == NULL) { Py_DECREF(shape1); return 0; } Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp)); Py_DECREF(tmp); if (shape1 == NULL) { return 0; } } } if (itershape == NULL) { PyErr_Format(PyExc_ValueError, "operands could not be broadcast together with " "shapes %S", shape1); Py_DECREF(shape1); return 0; } else { PyObject *shape2 = convert_shape_to_string(ndim, itershape, ""); if (shape2 == NULL) { Py_DECREF(shape1); return 0; } PyErr_Format(PyExc_ValueError, "operands could not be broadcast together with " "shapes %S and requested shape %S", shape1, shape2); Py_DECREF(shape1); Py_DECREF(shape2); return 0; } } else { PyObject *shape1 = PyUnicode_FromString(""); if (shape1 == NULL) { return 0; } for (iop = 0; iop < nop; ++iop) { if (op[iop] != NULL) { int *axes = op_axes[iop]; int ndims = PyArray_NDIM(op[iop]); npy_intp *dims = PyArray_DIMS(op[iop]); char *tmpstr = (axes == NULL) ? " " : "->"; PyObject *tmp = convert_shape_to_string(ndims, dims, tmpstr); if (tmp == NULL) { Py_DECREF(shape1); return 0; } Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp)); Py_DECREF(tmp); if (shape1 == NULL) { return 0; } if (axes != NULL) { for (idim = 0; idim < ndim; ++idim) { int i = npyiter_get_op_axis(axes[idim], NULL); if (i >= 0 && i < PyArray_NDIM(op[iop])) { remdims[idim] = PyArray_DIM(op[iop], i); } else { remdims[idim] = -1; } } PyObject *tmp = convert_shape_to_string(ndim, remdims, " "); if (tmp == NULL) { Py_DECREF(shape1); return 0; } Py_SETREF(shape1, PyUnicode_Concat(shape1, tmp)); Py_DECREF(tmp); if (shape1 == NULL) { return 0; } } } } if (itershape == NULL) { PyErr_Format(PyExc_ValueError, "operands could not be broadcast together with " "remapped shapes [original->remapped]: %S", shape1); Py_DECREF(shape1); return 0; } else { PyObject *shape2 = convert_shape_to_string(ndim, itershape, ""); if (shape2 == NULL) { Py_DECREF(shape1); return 0; } PyErr_Format(PyExc_ValueError, "operands could not be broadcast together with " "remapped shapes [original->remapped]: %S and " "requested shape %S", shape1, shape2); Py_DECREF(shape1); Py_DECREF(shape2); return 0; } } } operand_different_than_broadcast: { /* operand shape */ int ndims = PyArray_NDIM(op[iop]); npy_intp *dims = PyArray_DIMS(op[iop]); PyObject *shape1 = convert_shape_to_string(ndims, dims, ""); if (shape1 == NULL) { return 0; } /* Broadcast shape */ PyObject *shape2 = convert_shape_to_string(ndim, broadcast_shape, ""); if (shape2 == NULL) { Py_DECREF(shape1); return 0; } if (op_axes == NULL || op_axes[iop] == NULL) { /* operand shape not remapped */ if (op_flags[iop] & NPY_ITER_READONLY) { PyErr_Format(PyExc_ValueError, "non-broadcastable operand with shape %S doesn't " "match the broadcast shape %S", shape1, shape2); } else { PyErr_Format(PyExc_ValueError, "non-broadcastable output operand with shape %S doesn't " "match the broadcast shape %S", shape1, shape2); } Py_DECREF(shape1); Py_DECREF(shape2); return 0; } else { /* operand shape remapped */ npy_intp 
remdims[NPY_MAXDIMS]; int *axes = op_axes[iop]; for (idim = 0; idim < ndim; ++idim) { npy_intp i = axes[ndim - idim - 1]; if (i >= 0 && i < PyArray_NDIM(op[iop])) { remdims[idim] = PyArray_DIM(op[iop], i); } else { remdims[idim] = -1; } } PyObject *shape3 = convert_shape_to_string(ndim, remdims, ""); if (shape3 == NULL) { Py_DECREF(shape1); Py_DECREF(shape2); return 0; } if (op_flags[iop] & NPY_ITER_READONLY) { PyErr_Format(PyExc_ValueError, "non-broadcastable operand with shape %S " "[remapped to %S] doesn't match the broadcast shape %S", shape1, shape3, shape2); } else { PyErr_Format(PyExc_ValueError, "non-broadcastable output operand with shape %S " "[remapped to %S] doesn't match the broadcast shape %S", shape1, shape3, shape2); } Py_DECREF(shape1); Py_DECREF(shape2); Py_DECREF(shape3); return 0; } } } /* * Replaces the AXISDATA for the iop'th operand, broadcasting * the dimensions as necessary. Assumes the replacement array is * exactly the same shape as the original array used when * npy_fill_axisdata was called. * * If op_axes is not NULL, it should point to an ndim-sized * array. */ static void npyiter_replace_axisdata( NpyIter *iter, int iop, PyArrayObject *op, int orig_op_ndim, const int *op_axes) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int nop = NIT_NOP(iter); char *op_dataptr = PyArray_DATA(op); NpyIter_AxisData *axisdata0, *axisdata; npy_intp sizeof_axisdata; npy_int8 *perm; npy_intp baseoffset = 0; perm = NIT_PERM(iter); axisdata0 = NIT_AXISDATA(iter); sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* * Replace just the strides which were non-zero, and compute * the base data address. */ axisdata = axisdata0; if (op_axes != NULL) { for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { int i; npy_bool axis_flipped; npy_intp shape; /* Apply perm to get the original axis, and check if its flipped */ i = npyiter_undo_iter_axis_perm(idim, ndim, perm, &axis_flipped); i = npyiter_get_op_axis(op_axes[i], NULL); assert(i < orig_op_ndim); if (i >= 0) { shape = PyArray_DIM(op, i); if (shape != 1) { npy_intp stride = PyArray_STRIDE(op, i); if (axis_flipped) { NAD_STRIDES(axisdata)[iop] = -stride; baseoffset += stride*(shape-1); } else { NAD_STRIDES(axisdata)[iop] = stride; } } } } } else { for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { int i; npy_bool axis_flipped; npy_intp shape; i = npyiter_undo_iter_axis_perm( idim, orig_op_ndim, perm, &axis_flipped); if (i >= 0) { shape = PyArray_DIM(op, i); if (shape != 1) { npy_intp stride = PyArray_STRIDE(op, i); if (axis_flipped) { NAD_STRIDES(axisdata)[iop] = -stride; baseoffset += stride*(shape-1); } else { NAD_STRIDES(axisdata)[iop] = stride; } } } } } op_dataptr += baseoffset; /* Now the base data pointer is calculated, set it everywhere it's needed */ NIT_RESETDATAPTR(iter)[iop] = op_dataptr; NIT_BASEOFFSETS(iter)[iop] = baseoffset; axisdata = axisdata0; /* Fill at least one axisdata, for the 0-d case */ NAD_PTRS(axisdata)[iop] = op_dataptr; NIT_ADVANCE_AXISDATA(axisdata, 1); for (idim = 1; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { NAD_PTRS(axisdata)[iop] = op_dataptr; } } /* * Computes the iterator's index strides and initializes the index values * to zero. * * This must be called before the axes (i.e. the AXISDATA array) may * be reordered. 
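 *
 * Example: for an iterator shape of (3, 4) with NPY_ITER_C_INDEX the
 * index strides come out as 4 and 1, so element (i, j) is tracked as
 * 4*i + j; with NPY_ITER_F_INDEX they are 1 and 3, giving i + 3*j.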
*/ static void npyiter_compute_index_strides(NpyIter *iter, npy_uint32 flags) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int nop = NIT_NOP(iter); npy_intp indexstride; NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; /* * If there is only one element being iterated, we just have * to touch the first AXISDATA because nothing will ever be * incremented. This also initializes the data for the 0-d case. */ if (NIT_ITERSIZE(iter) == 1) { if (itflags & NPY_ITFLAG_HASINDEX) { axisdata = NIT_AXISDATA(iter); NAD_PTRS(axisdata)[nop] = 0; } return; } if (flags & NPY_ITER_C_INDEX) { sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); axisdata = NIT_AXISDATA(iter); indexstride = 1; for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { npy_intp shape = NAD_SHAPE(axisdata); if (shape == 1) { NAD_STRIDES(axisdata)[nop] = 0; } else { NAD_STRIDES(axisdata)[nop] = indexstride; } NAD_PTRS(axisdata)[nop] = 0; indexstride *= shape; } } else if (flags & NPY_ITER_F_INDEX) { sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); axisdata = NIT_INDEX_AXISDATA(NIT_AXISDATA(iter), ndim-1); indexstride = 1; for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, -1)) { npy_intp shape = NAD_SHAPE(axisdata); if (shape == 1) { NAD_STRIDES(axisdata)[nop] = 0; } else { NAD_STRIDES(axisdata)[nop] = indexstride; } NAD_PTRS(axisdata)[nop] = 0; indexstride *= shape; } } } /* * If the order is NPY_KEEPORDER, lets the iterator find the best * iteration order, otherwise forces it. Indicates in the itflags that * whether the iteration order was forced. */ static void npyiter_apply_forced_iteration_order(NpyIter *iter, NPY_ORDER order) { /*npy_uint32 itflags = NIT_ITFLAGS(iter);*/ int ndim = NIT_NDIM(iter); int iop, nop = NIT_NOP(iter); switch (order) { case NPY_CORDER: NIT_ITFLAGS(iter) |= NPY_ITFLAG_FORCEDORDER; break; case NPY_FORTRANORDER: NIT_ITFLAGS(iter) |= NPY_ITFLAG_FORCEDORDER; /* Only need to actually do something if there is more than 1 dim */ if (ndim > 1) { npyiter_reverse_axis_ordering(iter); } break; case NPY_ANYORDER: NIT_ITFLAGS(iter) |= NPY_ITFLAG_FORCEDORDER; /* Only need to actually do something if there is more than 1 dim */ if (ndim > 1) { PyArrayObject **op = NIT_OPERANDS(iter); int forder = 1; /* Check that all the array inputs are fortran order */ for (iop = 0; iop < nop; ++iop, ++op) { if (*op && !PyArray_CHKFLAGS(*op, NPY_ARRAY_F_CONTIGUOUS)) { forder = 0; break; } } if (forder) { npyiter_reverse_axis_ordering(iter); } } break; case NPY_KEEPORDER: /* Don't set the forced order flag here... */ break; } } /* * This function negates any strides in the iterator * which are negative. When iterating more than one * object, it only flips strides when they are all * negative or zero. */ static void npyiter_flip_negative_strides(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int iop, nop = NIT_NOP(iter); npy_intp istrides, nstrides = NAD_NSTRIDES(); NpyIter_AxisData *axisdata, *axisdata0; npy_intp *baseoffsets; npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); int any_flipped = 0; axisdata0 = axisdata = NIT_AXISDATA(iter); baseoffsets = NIT_BASEOFFSETS(iter); for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { npy_intp *strides = NAD_STRIDES(axisdata); int any_negative = 0; /* * Check the signs of all the operand strides. 
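 *
 * For instance, iterating only a reversed view (negative stride on an
 * axis, with no other operand having a positive stride there) flips that
 * axis: the base offset is advanced by (shape-1)*stride and the stride is
 * negated, so iteration runs forward over memory.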
*/ for (iop = 0; iop < nop; ++iop) { if (strides[iop] < 0) { any_negative = 1; } else if (strides[iop] != 0) { break; } } /* * If at least one stride is negative and none are positive, * flip all the strides for this dimension. */ if (any_negative && iop == nop) { npy_intp shapem1 = NAD_SHAPE(axisdata) - 1; for (istrides = 0; istrides < nstrides; ++istrides) { npy_intp stride = strides[istrides]; /* Adjust the base pointers to start at the end */ baseoffsets[istrides] += shapem1 * stride; /* Flip the stride */ strides[istrides] = -stride; } /* * Make the perm entry negative so get_multi_index * knows it's flipped */ NIT_PERM(iter)[idim] = -1-NIT_PERM(iter)[idim]; any_flipped = 1; } } /* * If any strides were flipped, the base pointers were adjusted * in the first AXISDATA, and need to be copied to all the rest */ if (any_flipped) { char **resetdataptr = NIT_RESETDATAPTR(iter); for (istrides = 0; istrides < nstrides; ++istrides) { resetdataptr[istrides] += baseoffsets[istrides]; } axisdata = axisdata0; for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { char **ptrs = NAD_PTRS(axisdata); for (istrides = 0; istrides < nstrides; ++istrides) { ptrs[istrides] = resetdataptr[istrides]; } } /* * Indicate that some of the perm entries are negative, * and that it's not (strictly speaking) the identity perm. */ NIT_ITFLAGS(iter) = (NIT_ITFLAGS(iter)|NPY_ITFLAG_NEGPERM) & ~NPY_ITFLAG_IDENTPERM; } } static void npyiter_reverse_axis_ordering(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int ndim = NIT_NDIM(iter); int nop = NIT_NOP(iter); npy_intp i, temp, size; npy_intp *first, *last; npy_int8 *perm; size = NIT_AXISDATA_SIZEOF(itflags, ndim, nop)/NPY_SIZEOF_INTP; first = (npy_intp*)NIT_AXISDATA(iter); last = first + (ndim-1)*size; /* This loop reverses the order of the AXISDATA array */ while (first < last) { for (i = 0; i < size; ++i) { temp = first[i]; first[i] = last[i]; last[i] = temp; } first += size; last -= size; } /* Store the perm we applied */ perm = NIT_PERM(iter); for(i = ndim-1; i >= 0; --i, ++perm) { *perm = (npy_int8)i; } NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_IDENTPERM; } static inline npy_intp intp_abs(npy_intp x) { return (x < 0) ? -x : x; } static void npyiter_find_best_axis_ordering(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int iop, nop = NIT_NOP(iter); npy_intp ax_i0, ax_i1, ax_ipos; npy_int8 ax_j0, ax_j1; npy_int8 *perm; NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); int permuted = 0; perm = NIT_PERM(iter); /* * Do a custom stable insertion sort. Note that because * the AXISDATA has been reversed from C order, this * is sorting from smallest stride to biggest stride. */ for (ax_i0 = 1; ax_i0 < ndim; ++ax_i0) { npy_intp *strides0; /* 'ax_ipos' is where perm[ax_i0] will get inserted */ ax_ipos = ax_i0; ax_j0 = perm[ax_i0]; strides0 = NAD_STRIDES(NIT_INDEX_AXISDATA(axisdata, ax_j0)); for (ax_i1 = ax_i0-1; ax_i1 >= 0; --ax_i1) { int ambig = 1, shouldswap = 0; npy_intp *strides1; ax_j1 = perm[ax_i1]; strides1 = NAD_STRIDES(NIT_INDEX_AXISDATA(axisdata, ax_j1)); for (iop = 0; iop < nop; ++iop) { if (strides0[iop] != 0 && strides1[iop] != 0) { if (intp_abs(strides1[iop]) <= intp_abs(strides0[iop])) { /* * Set swap even if it's not ambiguous already, * because in the case of conflicts between * different operands, C-order wins. 
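 *
 * Example: if one operand sees strides (16, 4) on the two axes being
 * compared while another sees (4, 16), the operands disagree; because a
 * "keep" comparison always clears the swap flag, conflicting operands
 * leave the current (C-like) ordering in place.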
*/ shouldswap = 0; } else { /* Only set swap if it's still ambiguous */ if (ambig) { shouldswap = 1; } } /* * A comparison has been done, so it's * no longer ambiguous */ ambig = 0; } } /* * If the comparison was unambiguous, either shift * 'ax_ipos' to 'ax_i1' or stop looking for an insertion * point */ if (!ambig) { if (shouldswap) { ax_ipos = ax_i1; } else { break; } } } /* Insert perm[ax_i0] into the right place */ if (ax_ipos != ax_i0) { for (ax_i1 = ax_i0; ax_i1 > ax_ipos; --ax_i1) { perm[ax_i1] = perm[ax_i1-1]; } perm[ax_ipos] = ax_j0; permuted = 1; } } /* Apply the computed permutation to the AXISDATA array */ if (permuted == 1) { npy_intp i, size = sizeof_axisdata/NPY_SIZEOF_INTP; NpyIter_AxisData *ad_i; /* Use the index as a flag, set each to 1 */ ad_i = axisdata; for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(ad_i, 1)) { NAD_INDEX(ad_i) = 1; } /* Apply the permutation by following the cycles */ for (idim = 0; idim < ndim; ++idim) { ad_i = NIT_INDEX_AXISDATA(axisdata, idim); /* If this axis hasn't been touched yet, process it */ if (NAD_INDEX(ad_i) == 1) { npy_int8 pidim = perm[idim]; npy_intp tmp; NpyIter_AxisData *ad_p, *ad_q; if (pidim != idim) { /* Follow the cycle, copying the data */ for (i = 0; i < size; ++i) { pidim = perm[idim]; ad_q = ad_i; tmp = *((npy_intp*)ad_q + i); while (pidim != idim) { ad_p = NIT_INDEX_AXISDATA(axisdata, pidim); *((npy_intp*)ad_q + i) = *((npy_intp*)ad_p + i); ad_q = ad_p; pidim = perm[(int)pidim]; } *((npy_intp*)ad_q + i) = tmp; } /* Follow the cycle again, marking it as done */ pidim = perm[idim]; while (pidim != idim) { NAD_INDEX(NIT_INDEX_AXISDATA(axisdata, pidim)) = 0; pidim = perm[(int)pidim]; } } NAD_INDEX(ad_i) = 0; } } /* Clear the identity perm flag */ NIT_ITFLAGS(iter) &= ~NPY_ITFLAG_IDENTPERM; } } /* * Calculates a dtype that all the types can be promoted to, using the * ufunc rules. If only_inputs is 1, it leaves any operands that * are not read from out of the calculation. */ static PyArray_Descr * npyiter_get_common_dtype(int nop, PyArrayObject **op, const npyiter_opitflags *op_itflags, PyArray_Descr **op_dtype, PyArray_Descr **op_request_dtypes, int only_inputs) { int iop; npy_intp narrs = 0, ndtypes = 0; PyArrayObject *arrs[NPY_MAXARGS]; PyArray_Descr *dtypes[NPY_MAXARGS]; PyArray_Descr *ret; NPY_IT_DBG_PRINT("Iterator: Getting a common data type from operands\n"); for (iop = 0; iop < nop; ++iop) { if (op_dtype[iop] != NULL && (!only_inputs || (op_itflags[iop] & NPY_OP_ITFLAG_READ))) { /* If no dtype was requested and the op is a scalar, pass the op */ if ((op_request_dtypes == NULL || op_request_dtypes[iop] == NULL) && PyArray_NDIM(op[iop]) == 0) { arrs[narrs++] = op[iop]; } /* Otherwise just pass in the dtype */ else { dtypes[ndtypes++] = op_dtype[iop]; } } } if (narrs == 0) { npy_intp i; ret = dtypes[0]; for (i = 1; i < ndtypes; ++i) { if (ret != dtypes[i]) break; } if (i == ndtypes) { if (ndtypes == 1 || PyArray_ISNBO(ret->byteorder)) { Py_INCREF(ret); } else { ret = PyArray_DescrNewByteorder(ret, NPY_NATIVE); } } else { ret = PyArray_ResultType(narrs, arrs, ndtypes, dtypes); } } else { ret = PyArray_ResultType(narrs, arrs, ndtypes, dtypes); } return ret; } /* * Allocates a temporary array which can be used to replace op * in the iteration. Its dtype will be op_dtype. * * The result array has a memory ordering which matches the iterator, * which may or may not match that of op. The parameter 'shape' may be * NULL, in which case it is filled in from the iterator's shape. 
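 *
 * Illustrative sketch (a hypothetical call, with made-up flag and dtype
 * values): for an iterator built with NPY_KEEPORDER over a single
 * Fortran-ordered 3x4 operand of an 8-byte dtype, a temporary allocated
 * via
 *
 *     npyiter_new_temp_array(iter, &PyArray_Type, flags, &op_itflags[iop],
 *                            2, NULL, op_dtype[iop], NULL);
 *
 * would typically come out Fortran-ordered as well (strides {8, 24}),
 * matching the iteration order so that the copy/cast loops run over
 * contiguous memory.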
* * This function must be called before any axes are coalesced. */ static PyArrayObject * npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype, npy_uint32 flags, npyiter_opitflags *op_itflags, int op_ndim, npy_intp const *shape, PyArray_Descr *op_dtype, const int *op_axes) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int used_op_ndim; int nop = NIT_NOP(iter); npy_int8 *perm = NIT_PERM(iter); npy_intp new_shape[NPY_MAXDIMS], strides[NPY_MAXDIMS]; npy_intp stride = op_dtype->elsize; NpyIter_AxisData *axisdata; npy_intp sizeof_axisdata; int i; PyArrayObject *ret; /* * There is an interaction with array-dtypes here, which * generally works. Let's say you make an nditer with an * output dtype of a 2-double array. All-scalar inputs * will result in a 1-dimensional output with shape (2). * Everything still works out in the nditer, because the * new dimension is always added on the end, and it cares * about what happens at the beginning. */ /* If it's a scalar, don't need to check the axes */ if (op_ndim == 0) { Py_INCREF(op_dtype); ret = (PyArrayObject *)PyArray_NewFromDescr(subtype, op_dtype, 0, NULL, NULL, NULL, 0, NULL); return ret; } axisdata = NIT_AXISDATA(iter); sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* Initialize the strides to invalid values */ for (i = 0; i < op_ndim; ++i) { strides[i] = NPY_MAX_INTP; } if (op_axes != NULL) { used_op_ndim = 0; for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { npy_bool reduction_axis; /* Apply the perm to get the original axis */ i = npyiter_undo_iter_axis_perm(idim, ndim, perm, NULL); i = npyiter_get_op_axis(op_axes[i], &reduction_axis); /* * If i < 0, this is a new axis (the operand does not have it) * so we can ignore it here. The iterator setup will have * ensured already that a potential reduction/broadcast is valid. */ if (i >= 0) { NPY_IT_DBG_PRINT3("Iterator: Setting allocated stride %d " "for iterator dimension %d to %d\n", (int)i, (int)idim, (int)stride); used_op_ndim += 1; strides[i] = stride; if (shape == NULL) { if (reduction_axis) { /* reduction axes always have a length of 1 */ new_shape[i] = 1; } else { new_shape[i] = NAD_SHAPE(axisdata); } stride *= new_shape[i]; if (i >= ndim) { PyErr_Format(PyExc_ValueError, "automatically allocated output array " "specified with an inconsistent axis mapping; " "the axis mapping cannot include dimension %d " "which is too large for the iterator dimension " "of %d.", i, ndim); return NULL; } } else { assert(!reduction_axis || shape[i] == 1); stride *= shape[i]; } } } } else { used_op_ndim = ndim; for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) { /* Apply the perm to get the original axis */ i = npyiter_undo_iter_axis_perm(idim, op_ndim, perm, NULL); if (i >= 0) { NPY_IT_DBG_PRINT3("Iterator: Setting allocated stride %d " "for iterator dimension %d to %d\n", (int)i, (int)idim, (int)stride); strides[i] = stride; if (shape == NULL) { new_shape[i] = NAD_SHAPE(axisdata); stride *= new_shape[i]; } else { stride *= shape[i]; } } } } if (shape == NULL) { /* If shape was NULL, use the shape we calculated */ op_ndim = used_op_ndim; shape = new_shape; /* * If there's a gap in the array's dimensions, it's an error. * For instance, if op_axes [0, 2] is specified, there will a place * in the strides array where the value is not set. 
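         *
         * In that situation the mapped dimensions have had their
         * strides[] entries filled in above, while the skipped
         * dimension still holds the NPY_MAX_INTP sentinel it was
         * initialized to, which is what the loop below detects and
         * reports as a missing entry in the axis mapping.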
*/ for (i = 0; i < op_ndim; i++) { if (strides[i] == NPY_MAX_INTP) { PyErr_Format(PyExc_ValueError, "automatically allocated output array " "specified with an inconsistent axis mapping; " "the axis mapping is missing an entry for " "dimension %d.", i); return NULL; } } } else if (used_op_ndim < op_ndim) { /* * If custom axes were specified, some dimensions may not have * been used. These are additional axes which are ignored in the * iterator but need to be handled here. */ npy_intp factor, itemsize, new_strides[NPY_MAXDIMS]; /* Fill in the missing strides in C order */ factor = 1; itemsize = op_dtype->elsize; for (i = op_ndim-1; i >= 0; --i) { if (strides[i] == NPY_MAX_INTP) { new_strides[i] = factor * itemsize; factor *= shape[i]; } } /* * Copy the missing strides, and multiply the existing strides * by the calculated factor. This way, the missing strides * are tighter together in memory, which is good for nested * loops. */ for (i = 0; i < op_ndim; ++i) { if (strides[i] == NPY_MAX_INTP) { strides[i] = new_strides[i]; } else { strides[i] *= factor; } } } /* Allocate the temporary array */ Py_INCREF(op_dtype); ret = (PyArrayObject *)PyArray_NewFromDescr(subtype, op_dtype, op_ndim, shape, strides, NULL, 0, NULL); if (ret == NULL) { return NULL; } /* Double-check that the subtype didn't mess with the dimensions */ if (subtype != &PyArray_Type) { /* * TODO: the dtype could have a subarray, which adds new dimensions * to `ret`, that should typically be fine, but will break * in this branch. */ if (PyArray_NDIM(ret) != op_ndim || !PyArray_CompareLists(shape, PyArray_DIMS(ret), op_ndim)) { PyErr_SetString(PyExc_RuntimeError, "Iterator automatic output has an array subtype " "which changed the dimensions of the output"); Py_DECREF(ret); return NULL; } } return ret; } static int npyiter_allocate_arrays(NpyIter *iter, npy_uint32 flags, PyArray_Descr **op_dtype, PyTypeObject *subtype, const npy_uint32 *op_flags, npyiter_opitflags *op_itflags, int **op_axes) { npy_uint32 itflags = NIT_ITFLAGS(iter); int idim, ndim = NIT_NDIM(iter); int iop, nop = NIT_NOP(iter); int check_writemasked_reductions = 0; NpyIter_BufferData *bufferdata = NULL; PyArrayObject **op = NIT_OPERANDS(iter); if (itflags & NPY_ITFLAG_BUFFER) { bufferdata = NIT_BUFFERDATA(iter); } if (flags & NPY_ITER_COPY_IF_OVERLAP) { /* * Perform operand memory overlap checks, if requested. * * If any write operand has memory overlap with any read operand, * eliminate all overlap by making temporary copies, by enabling * NPY_OP_ITFLAG_FORCECOPY for the write operand to force WRITEBACKIFCOPY. * * Operands with NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE enabled are not * considered overlapping if the arrays are exactly the same. In this * case, the iterator loops through them in the same order element by * element. (As usual, the user-provided inner loop is assumed to be * able to deal with this level of simple aliasing.) */ for (iop = 0; iop < nop; ++iop) { int may_share_memory = 0; int iother; if (op[iop] == NULL) { /* Iterator will always allocate */ continue; } if (!(op_itflags[iop] & NPY_OP_ITFLAG_WRITE)) { /* * Copy output operands only, not inputs. * A more sophisticated heuristic could be * substituted here later. 
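             *
             * Illustrative example (hypothetical operands, not from
             * this file): with NPY_ITER_COPY_IF_OVERLAP requested, a
             * write operand that is a view like a[:-1] combined with a
             * read operand a[1:] overlaps it at a one-element offset;
             * the checks below then set NPY_OP_ITFLAG_FORCECOPY on the
             * write operand, so the result is computed into a temporary
             * and written back through WRITEBACKIFCOPY when the
             * iterator is deallocated.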
*/ continue; } for (iother = 0; iother < nop; ++iother) { if (iother == iop || op[iother] == NULL) { continue; } if (!(op_itflags[iother] & NPY_OP_ITFLAG_READ)) { /* No data dependence for arrays not read from */ continue; } if (op_itflags[iother] & NPY_OP_ITFLAG_FORCECOPY) { /* Already copied */ continue; } /* * If the arrays are views to exactly the same data, no need * to make copies, if the caller (eg ufunc) says it accesses * data only in the iterator order. * * However, if there is internal overlap (e.g. a zero stride on * a non-unit dimension), a copy cannot be avoided. */ if ((op_flags[iop] & NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE) && (op_flags[iother] & NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE) && PyArray_BYTES(op[iop]) == PyArray_BYTES(op[iother]) && PyArray_NDIM(op[iop]) == PyArray_NDIM(op[iother]) && PyArray_CompareLists(PyArray_DIMS(op[iop]), PyArray_DIMS(op[iother]), PyArray_NDIM(op[iop])) && PyArray_CompareLists(PyArray_STRIDES(op[iop]), PyArray_STRIDES(op[iother]), PyArray_NDIM(op[iop])) && PyArray_DESCR(op[iop]) == PyArray_DESCR(op[iother]) && solve_may_have_internal_overlap(op[iop], 1) == 0) { continue; } /* * Use max work = 1. If the arrays are large, it might * make sense to go further. */ may_share_memory = solve_may_share_memory(op[iop], op[iother], 1); if (may_share_memory) { op_itflags[iop] |= NPY_OP_ITFLAG_FORCECOPY; break; } } } } for (iop = 0; iop < nop; ++iop) { /* * Check whether there are any WRITEMASKED REDUCE operands * which should be validated after all the strides are filled * in. */ if ((op_itflags[iop] & (NPY_OP_ITFLAG_WRITEMASKED | NPY_OP_ITFLAG_REDUCE)) == (NPY_OP_ITFLAG_WRITEMASKED | NPY_OP_ITFLAG_REDUCE)) { check_writemasked_reductions = 1; } /* NULL means an output the iterator should allocate */ if (op[iop] == NULL) { PyArrayObject *out; PyTypeObject *op_subtype; /* Check whether the subtype was disabled */ op_subtype = (op_flags[iop] & NPY_ITER_NO_SUBTYPE) ? &PyArray_Type : subtype; /* * Allocate the output array. * * Note that here, ndim is always correct if no op_axes was given * (but the actual dimension of op can be larger). If op_axes * is given, ndim is not actually used. */ out = npyiter_new_temp_array(iter, op_subtype, flags, &op_itflags[iop], ndim, NULL, op_dtype[iop], op_axes ? op_axes[iop] : NULL); if (out == NULL) { return 0; } op[iop] = out; /* * Now we need to replace the pointers and strides with values * from the new array. */ npyiter_replace_axisdata(iter, iop, op[iop], ndim, op_axes ? op_axes[iop] : NULL); /* * New arrays are guaranteed true-aligned, but copy/cast code * needs uint-alignment in addition. */ if (IsUintAligned(out)) { op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED; } /* New arrays need no cast */ op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; } /* * If casting is required, the operand is read-only, and * it's an array scalar, make a copy whether or not the * copy flag is enabled. */ else if ((op_itflags[iop] & (NPY_OP_ITFLAG_CAST | NPY_OP_ITFLAG_READ | NPY_OP_ITFLAG_WRITE)) == (NPY_OP_ITFLAG_CAST | NPY_OP_ITFLAG_READ) && PyArray_NDIM(op[iop]) == 0) { PyArrayObject *temp; Py_INCREF(op_dtype[iop]); temp = (PyArrayObject *)PyArray_NewFromDescr( &PyArray_Type, op_dtype[iop], 0, NULL, NULL, NULL, 0, NULL); if (temp == NULL) { return 0; } if (PyArray_CopyInto(temp, op[iop]) != 0) { Py_DECREF(temp); return 0; } Py_DECREF(op[iop]); op[iop] = temp; /* * Now we need to replace the pointers and strides with values * from the temporary array. 
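             *
             * Roughly (a descriptive sketch, not literal code from this
             * file), the call below has this effect for the 0-d operand:
             *
             *     resetdataptr[iop] = PyArray_BYTES(temp);
             *     every AXISDATA stride for iop becomes 0;
             *
             * so each iteration step reads the cast scalar in place of
             * the original array scalar.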
*/ npyiter_replace_axisdata(iter, iop, op[iop], 0, NULL); /* * New arrays are guaranteed true-aligned, but copy/cast code * needs uint-alignment in addition. */ if (IsUintAligned(temp)) { op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED; } /* * New arrays need no cast, and in the case * of scalars, always have stride 0 so never need buffering */ op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; if (itflags & NPY_ITFLAG_BUFFER) { NBF_STRIDES(bufferdata)[iop] = 0; } } /* * Make a temporary copy if, * 1. If casting is required and permitted, or, * 2. If force-copy is requested */ else if (((op_itflags[iop] & NPY_OP_ITFLAG_CAST) && (op_flags[iop] & (NPY_ITER_COPY|NPY_ITER_UPDATEIFCOPY))) || (op_itflags[iop] & NPY_OP_ITFLAG_FORCECOPY)) { PyArrayObject *temp; int ondim = PyArray_NDIM(op[iop]); /* Allocate the temporary array, if possible */ temp = npyiter_new_temp_array(iter, &PyArray_Type, flags, &op_itflags[iop], ondim, PyArray_DIMS(op[iop]), op_dtype[iop], op_axes ? op_axes[iop] : NULL); if (temp == NULL) { return 0; } /* * If the data will be read, copy it into temp. * TODO: It might be possible to do a view into * op[iop]'s mask instead here. */ if (op_itflags[iop] & NPY_OP_ITFLAG_READ) { if (PyArray_CopyInto(temp, op[iop]) != 0) { Py_DECREF(temp); return 0; } } /* If the data will be written to, set WRITEBACKIFCOPY and require a context manager */ if (op_itflags[iop] & NPY_OP_ITFLAG_WRITE) { Py_INCREF(op[iop]); if (PyArray_SetWritebackIfCopyBase(temp, op[iop]) < 0) { Py_DECREF(temp); return 0; } op_itflags[iop] |= NPY_OP_ITFLAG_HAS_WRITEBACK; } Py_DECREF(op[iop]); op[iop] = temp; /* * Now we need to replace the pointers and strides with values * from the temporary array. */ npyiter_replace_axisdata(iter, iop, op[iop], ondim, op_axes ? op_axes[iop] : NULL); /* * New arrays are guaranteed true-aligned, but copy/cast code * additionally needs uint-alignment in addition. */ if (IsUintAligned(temp)) { op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED; } /* The temporary copy needs no cast */ op_itflags[iop] &= ~NPY_OP_ITFLAG_CAST; } else { /* * Buffering must be enabled for casting/conversion if copy * wasn't specified. */ if ((op_itflags[iop] & NPY_OP_ITFLAG_CAST) && !(itflags & NPY_ITFLAG_BUFFER)) { PyErr_SetString(PyExc_TypeError, "Iterator operand required copying or buffering, " "but neither copying nor buffering was enabled"); return 0; } /* * If the operand is aligned, any buffering can use aligned * optimizations. */ if (IsUintAligned(op[iop])) { op_itflags[iop] |= NPY_OP_ITFLAG_ALIGNED; } } /* Here we can finally check for contiguous iteration */ if (op_flags[iop] & NPY_ITER_CONTIG) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); npy_intp stride = NAD_STRIDES(axisdata)[iop]; if (stride != op_dtype[iop]->elsize) { NPY_IT_DBG_PRINT("Iterator: Setting NPY_OP_ITFLAG_CAST " "because of NPY_ITER_CONTIG\n"); op_itflags[iop] |= NPY_OP_ITFLAG_CAST; if (!(itflags & NPY_ITFLAG_BUFFER)) { PyErr_SetString(PyExc_TypeError, "Iterator operand required buffering, " "to be contiguous as requested, but " "buffering is not enabled"); return 0; } } } /* * If no alignment, byte swap, or casting is needed, * the inner stride of this operand works for the whole * array, we can set NPY_OP_ITFLAG_BUFNEVER. 
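         *
         * Illustrative example (hypothetical values): a C-contiguous
         * operand of an 8-byte dtype iterated with shape (3, 4) has
         * strides {8, 32} in the iterator's reversed AXISDATA order.
         * The scan below finds innerstride = 8 and innershape = 4, and
         * since 8 * 4 == 32 the outer axis coalesces with the inner
         * one, so the whole operand can be traversed with the single
         * stride 8 and NPY_OP_ITFLAG_BUFNEVER can be set.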
*/ if ((itflags & NPY_ITFLAG_BUFFER) && !(op_itflags[iop] & NPY_OP_ITFLAG_CAST)) { NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); if (ndim <= 1) { op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; NBF_STRIDES(bufferdata)[iop] = NAD_STRIDES(axisdata)[iop]; } else if (PyArray_NDIM(op[iop]) > 0) { npy_intp stride, shape, innerstride = 0, innershape; npy_intp sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* Find stride of the first non-empty shape */ for (idim = 0; idim < ndim; ++idim) { innershape = NAD_SHAPE(axisdata); if (innershape != 1) { innerstride = NAD_STRIDES(axisdata)[iop]; break; } NIT_ADVANCE_AXISDATA(axisdata, 1); } ++idim; NIT_ADVANCE_AXISDATA(axisdata, 1); /* Check that everything could have coalesced together */ for (; idim < ndim; ++idim) { stride = NAD_STRIDES(axisdata)[iop]; shape = NAD_SHAPE(axisdata); if (shape != 1) { /* * If N times the inner stride doesn't equal this * stride, the multi-dimensionality is needed. */ if (innerstride*innershape != stride) { break; } else { innershape *= shape; } } NIT_ADVANCE_AXISDATA(axisdata, 1); } /* * If we looped all the way to the end, one stride works. * Set that stride, because it may not belong to the first * dimension. */ if (idim == ndim) { op_itflags[iop] |= NPY_OP_ITFLAG_BUFNEVER; NBF_STRIDES(bufferdata)[iop] = innerstride; } } } } if (check_writemasked_reductions) { for (iop = 0; iop < nop; ++iop) { /* * Check whether there are any WRITEMASKED REDUCE operands * which should be validated now that all the strides are filled * in. */ if ((op_itflags[iop] & (NPY_OP_ITFLAG_WRITEMASKED | NPY_OP_ITFLAG_REDUCE)) == (NPY_OP_ITFLAG_WRITEMASKED | NPY_OP_ITFLAG_REDUCE)) { /* * If the ARRAYMASK has 'bigger' dimensions * than this REDUCE WRITEMASKED operand, * the result would be more than one mask * value per reduction element, something which * is invalid. This function provides validation * for that. */ if (!check_mask_for_writemasked_reduction(iter, iop)) { return 0; } } } } return 1; } /* * The __array_priority__ attribute of the inputs determines * the subtype of any output arrays. This function finds the * subtype of the input array with highest priority. */ static void npyiter_get_priority_subtype(int nop, PyArrayObject **op, const npyiter_opitflags *op_itflags, double *subtype_priority, PyTypeObject **subtype) { int iop; for (iop = 0; iop < nop; ++iop) { if (op[iop] != NULL && op_itflags[iop] & NPY_OP_ITFLAG_READ) { double priority = PyArray_GetPriority((PyObject *)op[iop], 0.0); if (priority > *subtype_priority) { *subtype_priority = priority; *subtype = Py_TYPE(op[iop]); } } } } static int npyiter_allocate_transfer_functions(NpyIter *iter) { npy_uint32 itflags = NIT_ITFLAGS(iter); /*int ndim = NIT_NDIM(iter);*/ int iop = 0, nop = NIT_NOP(iter); npy_intp i; npyiter_opitflags *op_itflags = NIT_OPITFLAGS(iter); NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter); NpyIter_AxisData *axisdata = NIT_AXISDATA(iter); PyArrayObject **op = NIT_OPERANDS(iter); PyArray_Descr **op_dtype = NIT_DTYPES(iter); npy_intp *strides = NAD_STRIDES(axisdata), op_stride; NpyIter_TransferInfo *transferinfo = NBF_TRANSFERINFO(bufferdata); /* combined cast flags, the new cast flags for each cast: */ NPY_ARRAYMETHOD_FLAGS cflags = PyArrayMethod_MINIMAL_FLAGS; NPY_ARRAYMETHOD_FLAGS nc_flags; for (iop = 0; iop < nop; ++iop) { npyiter_opitflags flags = op_itflags[iop]; /* * Reduction operands may be buffered with a different stride, * so we must pass NPY_MAX_INTP to the transfer function factory. */ op_stride = (flags & NPY_OP_ITFLAG_REDUCE) ? 
                NPY_MAX_INTP : strides[iop];

        /*
         * If we have determined that a buffer may be needed,
         * allocate the appropriate transfer functions
         */
        if (!(flags & NPY_OP_ITFLAG_BUFNEVER)) {
            if (flags & NPY_OP_ITFLAG_READ) {
                int move_references = 0;
                if (PyArray_GetDTypeTransferFunction(
                                        (flags & NPY_OP_ITFLAG_ALIGNED) != 0,
                                        op_stride,
                                        op_dtype[iop]->elsize,
                                        PyArray_DESCR(op[iop]),
                                        op_dtype[iop],
                                        move_references,
                                        &transferinfo[iop].read,
                                        &nc_flags) != NPY_SUCCEED) {
                    iop -= 1;  /* This one cannot be cleaned up yet. */
                    goto fail;
                }
                cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
            }
            else {
                transferinfo[iop].read.func = NULL;
            }
            if (flags & NPY_OP_ITFLAG_WRITE) {
                int move_references = 1;

                /* If the operand is WRITEMASKED, use a masked transfer fn */
                if (flags & NPY_OP_ITFLAG_WRITEMASKED) {
                    int maskop = NIT_MASKOP(iter);
                    PyArray_Descr *mask_dtype = PyArray_DESCR(op[maskop]);

                    /*
                     * If the mask's stride is contiguous, use it, otherwise
                     * the mask may or may not be buffered, so the stride
                     * could be inconsistent.
                     */
                    if (PyArray_GetMaskedDTypeTransferFunction(
                                (flags & NPY_OP_ITFLAG_ALIGNED) != 0,
                                op_dtype[iop]->elsize,
                                op_stride,
                                (strides[maskop] == mask_dtype->elsize) ?
                                        mask_dtype->elsize : NPY_MAX_INTP,
                                op_dtype[iop],
                                PyArray_DESCR(op[iop]),
                                mask_dtype,
                                move_references,
                                &transferinfo[iop].write,
                                &nc_flags) != NPY_SUCCEED) {
                        goto fail;
                    }
                    cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
                }
                else {
                    if (PyArray_GetDTypeTransferFunction(
                                        (flags & NPY_OP_ITFLAG_ALIGNED) != 0,
                                        op_dtype[iop]->elsize,
                                        op_stride,
                                        op_dtype[iop],
                                        PyArray_DESCR(op[iop]),
                                        move_references,
                                        &transferinfo[iop].write,
                                        &nc_flags) != NPY_SUCCEED) {
                        goto fail;
                    }
                    cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
                }
            }
            else {
                transferinfo[iop].write.func = NULL;
            }

            /* Get the clear function, used for no-writeback and on error */
            if (PyDataType_REFCHK(op_dtype[iop])) {
                /*
                 * The clear function releases any references held by a
                 * single element, so buffer contents can be discarded
                 * without leaking them.
                 */
                if (PyArray_GetClearFunction(
                        (flags & NPY_OP_ITFLAG_ALIGNED) != 0,
                        op_dtype[iop]->elsize, op_dtype[iop],
                        &transferinfo[iop].clear, &nc_flags) < 0) {
                    goto fail;
                }
                cflags = PyArrayMethod_COMBINED_FLAGS(cflags, nc_flags);
            }
            else {
                transferinfo[iop].clear.func = NULL;
            }
        }
        else {
            transferinfo[iop].read.func = NULL;
            transferinfo[iop].write.func = NULL;
            transferinfo[iop].clear.func = NULL;
        }
    }

    /* Store the combined transfer flags on the iterator */
    NIT_ITFLAGS(iter) |= cflags << NPY_ITFLAG_TRANSFERFLAGS_SHIFT;
    assert(NIT_ITFLAGS(iter) >> NPY_ITFLAG_TRANSFERFLAGS_SHIFT == cflags);

    /* If any of the dtype transfer functions needed the API, flag it. */
    if (cflags & NPY_METH_REQUIRES_PYAPI) {
        NIT_ITFLAGS(iter) |= NPY_ITFLAG_NEEDSAPI;
    }

    return 1;

fail:
    for (i = 0; i < iop+1; ++i) {
        /* Free by the loop index so every set of transfer functions
         * allocated so far is released, not just the current one. */
        NPY_cast_info_xfree(&transferinfo[i].read);
        NPY_cast_info_xfree(&transferinfo[i].write);
    }
    return 0;
}

#undef NPY_ITERATOR_IMPLEMENTATION_CODE
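
/*
 * Illustrative usage sketch (not part of this file; it uses only NumPy's
 * public iterator C-API, error handling is elided, and process() stands
 * in for the caller's inner loop): once construction has allocated the
 * transfer functions above, a caller typically drives the iterator like
 * this:
 *
 *     NpyIter *iter = NpyIter_MultiNew(2, ops, NPY_ITER_BUFFERED,
 *                                      NPY_KEEPORDER,
 *                                      NPY_SAME_KIND_CASTING,
 *                                      op_flags, NULL);
 *     NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
 *     char **dataptr = NpyIter_GetDataPtrArray(iter);
 *     do {
 *         process(dataptr[0], dataptr[1]);
 *     } while (iternext(iter));
 *     NpyIter_Deallocate(iter);
 *
 * Each refill of the buffered inner loop goes through the read/write
 * transfer functions set up by npyiter_allocate_transfer_functions.
 */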