5 files changed, 632 insertions, 165 deletions
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 75f89945a..b943f17de 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -301,6 +301,7 @@ multiarray_funcs_api = {
     'PyArray_CanCastArrayTo':               266,
     'PyArray_CanCastTypeTo':                267,
     'PyArray_EinsteinSum':                  268,
+    'PyArray_FillWithZero':                 269,
 }
 
 ufunc_types_api = {
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index a2c965181..5aef72d45 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -13,6 +13,7 @@
 
 #include "arrayobject.h"
 #include "mapping.h"
+#include "lowlevel_strided_loops.h"
 
 #include "convert.h"
 
@@ -329,8 +330,104 @@ PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj)
 }
 
 /*NUMPY_API
-  Copy an array.
-*/
+ * Fills an array with zeros.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+NPY_NO_EXPORT int
+PyArray_FillWithZero(PyArrayObject *a)
+{
+    PyArray_StridedTransferFn *stransfer = NULL;
+    void *transferdata = NULL;
+    PyArray_Descr *dtype = PyArray_DESCR(a);
+    NpyIter *iter;
+
+    NpyIter_IterNext_Fn iternext;
+    char **dataptr;
+    npy_intp stride, *countptr;
+    int needs_api;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!PyArray_ISWRITEABLE(a)) {
+        PyErr_SetString(PyExc_RuntimeError, "cannot write to array");
+        return -1;
+    }
+
+    /* A zero-sized array needs no zeroing */
+    if (PyArray_SIZE(a) == 0) {
+        return 0;
+    }
+
+    /* If it's possible to do a simple memset, do so */
+    if (!PyDataType_REFCHK(dtype) && (PyArray_ISCONTIGUOUS(a) ||
+                                      PyArray_ISFORTRAN(a))) {
+        memset(PyArray_DATA(a), 0, PyArray_NBYTES(a));
+        return 0;
+    }
+
+    /* Use an iterator to go through all the data */
+    iter = NpyIter_New(a, NPY_ITER_WRITEONLY|NPY_ITER_NO_INNER_ITERATION,
+                    NPY_KEEPORDER, NPY_NO_CASTING,
+                    NULL, 0, NULL, 0);
+    
+    if (iter == NULL) {
+        return -1;
+    }
+
+    iternext = NpyIter_GetIterNext(iter, NULL);
+    if (iternext == NULL) {
+        NpyIter_Deallocate(iter);
+        return -1;
+    }
+    dataptr = NpyIter_GetDataPtrArray(iter);
+    stride = NpyIter_GetInnerStrideArray(iter)[0];
+    countptr = NpyIter_GetInnerLoopSizePtr(iter);
+
+    needs_api = NpyIter_IterationNeedsAPI(iter);
+
+    /*
+     * Because buffering is disabled in the iterator, the inner loop
+     * strides will be the same throughout the iteration loop.  Thus,
+     * we can pass them to this function to take advantage of
+     * contiguous strides, etc.
+     *
+     * By setting the src_dtype to NULL, we get a function which sets
+     * the destination to zeros.
+     */
+    if (PyArray_GetDTypeTransferFunction(
+                    PyArray_ISALIGNED(a),
+                    0, stride,
+                    NULL, PyArray_DESCR(a),
+                    0,
+                    &stransfer, &transferdata,
+                    &needs_api) != NPY_SUCCEED) {
+        NpyIter_Deallocate(iter);
+        return -1;
+    }
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS;
+    }
+
+    do {
+        stransfer(NULL, 0, *dataptr, stride,
+                    *countptr, 0, transferdata);
+    } while(iternext(iter));
+
+    if (!needs_api) {
+        NPY_END_THREADS;
+    }
+
+    PyArray_FreeStridedTransferData(transferdata);
+    NpyIter_Deallocate(iter);
+
+    return 0;
+}
+
+/*NUMPY_API
+ * Copy an array.
+ */
 NPY_NO_EXPORT PyObject *
 PyArray_NewCopy(PyArrayObject *m1, NPY_ORDER fortran)
 {
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index d7dc73a23..97905ca54 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -17,6 +17,12 @@
 
 #include <ctype.h>
 
+typedef enum {
+    BROADCAST_LEFT,
+    BROADCAST_RIGHT,
+    BROADCAST_MIDDLE
+} EINSUM_BROADCAST;
+
 /*
  * Parses the subscripts for one operand into an output
  * of 'ndim' labels
@@ -28,9 +34,11 @@ parse_operand_subscripts(char *subscripts, int length,
                         char *out_label_counts,
                         int *out_min_label,
                         int *out_max_label,
-                        int *out_num_labels)
+                        int *out_num_labels,
+                        EINSUM_BROADCAST *out_broadcast)
 {
     int i, idim, ndim_left, label;
+    int left_labels = 0, right_labels = 0;
 
     printf("Parsing operand %d subscripts\n", iop);
 
@@ -55,6 +63,7 @@ parse_operand_subscripts(char *subscripts, int length,
                     (*out_num_labels)++;
                 }
                 out_label_counts[label]++;
+                right_labels = 1;
             }
             else {
                 PyErr_Format(PyExc_ValueError,
@@ -114,6 +123,7 @@ parse_operand_subscripts(char *subscripts, int length,
                         (*out_num_labels)++;
                     }
                     out_label_counts[label]++;
+                    left_labels = 1;
                 }
                 else {
                     PyErr_Format(PyExc_ValueError,
@@ -161,6 +171,16 @@ parse_operand_subscripts(char *subscripts, int length,
         }
     }
 
+    if (left_labels && right_labels) {
+        *out_broadcast = BROADCAST_MIDDLE;
+    }
+    else if (!left_labels) {
+        *out_broadcast = BROADCAST_RIGHT;
+    }
+    else {
+        *out_broadcast = BROADCAST_LEFT;
+    }
+
     return 1;
 }
 
@@ -173,9 +193,11 @@ static int
 parse_output_subscripts(char *subscripts, int length,
                         int ndim_broadcast,
                         const char *label_counts,
-                        char *out_labels)
+                        char *out_labels,
+                        EINSUM_BROADCAST *out_broadcast)
 {
     int i, nlabels, label, idim, ndim, ndim_left;
+    int left_labels = 0, right_labels = 0;
 
     /* Count the labels, making sure they're all unique and valid */
     nlabels = 0;
@@ -230,6 +252,7 @@ parse_output_subscripts(char *subscripts, int length,
                             "too many output subscripts");
                 return -1;
             }
+            right_labels = 1;
         }
         /* The end of the ellipsis */
         else {
@@ -269,6 +292,7 @@ parse_output_subscripts(char *subscripts, int length,
                                 "too many subscripts for the output");
                     return -1;
                 }
+                left_labels = 1;
             }
             else {
                 PyErr_SetString(PyExc_ValueError,
@@ -284,9 +308,353 @@ parse_output_subscripts(char *subscripts, int length,
         out_labels[idim++] = 0;
     }
 
+    if (left_labels && right_labels) {
+        *out_broadcast = BROADCAST_MIDDLE;
+    }
+    else if (!left_labels) {
+        *out_broadcast = BROADCAST_RIGHT;
+    }
+    else {
+        *out_broadcast = BROADCAST_LEFT;
+    }
+
     return ndim;
 }
 
+
+/*
+ * When there's just one operand and no reduction, we
+ * can return a view into op.  This calculates the view
+ * if possible.
+ */
+static int
+get_single_op_view(PyArrayObject *op, int  iop, char *labels,
+                   int ndim_output, char *output_labels,
+                   PyArrayObject **ret)
+{
+    npy_intp new_strides[NPY_MAXDIMS];
+    npy_intp new_dims[NPY_MAXDIMS];
+    char *out_label;
+    int label, i, idim, ndim, ibroadcast = 0;
+
+    ndim = PyArray_NDIM(op);
+
+    /* Initialize the dimensions and strides to zero */
+    for (idim = 0; idim < ndim_output; ++idim) {
+        new_dims[idim] = 0;
+        new_strides[idim] = 0;
+    }
+
+    /* Match the labels in the operand with the output labels */
+    for (idim = 0; idim < ndim; ++idim) {
+        printf("Matching label for dimension %d\n", idim);
+        label = labels[idim];
+        /* If this label says to merge axes, get the actual label */
+        if (label < 0) {
+            label = labels[idim+label];
+        }
+        /* If the label is 0, it's an unlabeled broadcast dimension */
+        if (label == 0) {
+            /* The next output label that's a broadcast dimension */
+            for (; ibroadcast < ndim_output; ++ibroadcast) {
+                if (output_labels[ibroadcast] == 0) {
+                    break;
+                }
+            }
+            if (ibroadcast == ndim_output) {
+                PyErr_SetString(PyExc_ValueError,
+                        "output had too few broadcast dimensions");
+                return 0;
+            }
+            new_dims[ibroadcast] = PyArray_DIM(op, idim);
+            new_strides[ibroadcast] = PyArray_STRIDE(op, idim);
+            ++ibroadcast;
+        }
+        else {
+            /* Find the position for this dimension in the output */
+            out_label = (char *)memchr(output_labels, label,
+                                                    ndim_output);
+            /* If it's not found, reduction -> can't return a view */
+            if (out_label == NULL) {
+                break;
+            }
+            /* Update the dimensions and strides of the output */
+            i = out_label - output_labels;
+            if (new_dims[i] != 0 &&
+                    new_dims[i] != PyArray_DIM(op, idim)) {
+                PyErr_Format(PyExc_ValueError,
+                        "dimensions in operand %d for collapsing "
+                        "index '%c' don't match (%d != %d)",
+                        iop, label, (int)new_dims[i],
+                        (int)PyArray_DIM(op, idim));
+                return 0;
+            }
+            new_dims[i] = PyArray_DIM(op, idim);
+            new_strides[i] += PyArray_STRIDE(op, idim);
+        }
+    }
+    /* If we processed all the input axes, return a view */
+    if (idim == ndim) {
+        printf("Returning a view\n");
+        Py_INCREF(PyArray_DESCR(op));
+        *ret = (PyArrayObject *)PyArray_NewFromDescr(
+                                Py_TYPE(op),
+                                PyArray_DESCR(op),
+                                ndim_output, new_dims, new_strides,
+                                PyArray_DATA(op),
+                                0, (PyObject *)op);
+
+        if (*ret == NULL) {
+            return 0;
+        }
+        if (!PyArray_Check(*ret)) {
+            Py_DECREF(*ret);
+            *ret = NULL;
+            PyErr_SetString(PyExc_RuntimeError,
+                        "NewFromDescr failed to return an array");
+            return 0;
+        }
+        PyArray_UpdateFlags(*ret,
+                    NPY_C_CONTIGUOUS|NPY_ALIGNED|NPY_F_CONTIGUOUS);
+        Py_INCREF(op);
+        PyArray_BASE(*ret) = (PyObject *)op;
+        return 1;
+    }
+
+    /* Return success, but that we couldn't make a view */
+    *ret = NULL;
+    return 1;
+}
+
+static PyArrayObject *
+get_combined_dims_view(PyArrayObject *op, int iop, char *labels)
+{
+    npy_intp new_strides[NPY_MAXDIMS];
+    npy_intp new_dims[NPY_MAXDIMS];
+    int i, idim, ndim, icombine, combineoffset, label;
+    int icombinemap[NPY_MAXDIMS];
+
+    PyArrayObject *ret = NULL;
+
+    ndim = PyArray_NDIM(op);
+
+    /* Initialize the dimensions and strides to zero */
+    for (idim = 0; idim < ndim; ++idim) {
+        new_dims[idim] = 0;
+        new_strides[idim] = 0;
+    }
+
+    /* Copy the dimensions and strides, except when collapsing */
+    icombine = 0;
+    for (idim = 0; idim < ndim; ++idim) {
+        printf("Processing dimension %d\n", idim);
+        label = labels[idim];
+        /* If this label says to merge axes, get the actual label */
+        if (label < 0) {
+            combineoffset = label;
+            label = labels[idim+label];
+        }
+        else {
+            combineoffset = 0;
+            if (icombine != idim) {
+                labels[icombine] = labels[idim];
+            }
+            icombinemap[idim] = icombine;
+        }
+        /* If the label is 0, it's an unlabeled broadcast dimension */
+        if (label == 0) {
+            new_dims[icombine] = PyArray_DIM(op, idim);
+            new_strides[icombine] = PyArray_STRIDE(op, idim);
+        }
+        else {
+            /* Update the combined axis dimensions and strides */
+            i = idim + combineoffset;
+            if (combineoffset < 0 &&
+                        new_dims[i] != PyArray_DIM(op, idim)) {
+                PyErr_Format(PyExc_ValueError,
+                        "dimensions in operand %d for collapsing "
+                        "index '%c' don't match (%d != %d)",
+                        iop, label, (int)new_dims[i],
+                        (int)PyArray_DIM(op, idim));
+                return NULL;
+            }
+            i = icombinemap[i];
+            new_dims[i] = PyArray_DIM(op, idim);
+            new_strides[i] += PyArray_STRIDE(op, idim);
+        }
+
+        /* If the label didn't say to combine axes, increment dest i */
+        if (combineoffset == 0) {
+            icombine++;
+        }
+    }
+
+    /* The compressed number of dimensions */
+    ndim = icombine;
+
+    Py_INCREF(PyArray_DESCR(op));
+    ret = (PyArrayObject *)PyArray_NewFromDescr(
+                            Py_TYPE(op),
+                            PyArray_DESCR(op),
+                            ndim, new_dims, new_strides,
+                            PyArray_DATA(op),
+                            PyArray_ISWRITEABLE(op) ? NPY_WRITEABLE : 0,
+                            (PyObject *)op);
+
+    if (ret == NULL) {
+        return NULL;
+    }
+    if (!PyArray_Check(ret)) {
+        Py_DECREF(ret);
+        PyErr_SetString(PyExc_RuntimeError,
+                    "NewFromDescr failed to return an array");
+        return NULL;
+    }
+    PyArray_UpdateFlags(ret,
+                NPY_C_CONTIGUOUS|NPY_ALIGNED|NPY_F_CONTIGUOUS);
+    Py_INCREF(op);
+    PyArray_BASE(ret) = (PyObject *)op;
+
+    return ret;
+}
+
+static int
+prepare_op_axes(int ndim, int iop, char *labels, npy_intp *axes,
+            npy_intp ndim_iter, char *iter_labels, EINSUM_BROADCAST broadcast)
+{
+    int i, label, ibroadcast;
+
+    /* Regular broadcasting */
+    if (broadcast == BROADCAST_RIGHT) {
+        /* broadcast dimensions get placed in rightmost position */
+        ibroadcast = ndim-1; 
+        for (i = ndim_iter-1; i >= 0; --i) {
+            label = iter_labels[i];
+            /*
+             * If it's an unlabeled broadcast dimension, choose
+             * the next broadcast dimension from the operand.
+             */
+            if (label == 0) {
+                while (ibroadcast >= 0 && labels[ibroadcast] != 0) {
+                    --ibroadcast;
+                }
+                /*
+                 * If we used up all the operand broadcast dimensions,
+                 * extend it with a "newaxis"
+                 */
+                if (ibroadcast < 0) {
+                    axes[i] = -1;
+                }
+                /* Otherwise map to the broadcast axis */
+                else {
+                    axes[i] = ibroadcast;
+                }
+            }
+            /* It's a labeled dimension, find the matching one */
+            else {
+                char *match = memchr(labels, label, ndim);
+                /* If the op doesn't have the label, broadcast it */
+                if (match == NULL) {
+                    axes[i] = -1;
+                }
+                /* Otherwise use it */
+                else {
+                    axes[i] = match - labels;
+                }
+            }
+        }
+    }
+    /* Reverse broadcasting */
+    else if (broadcast == BROADCAST_LEFT) {
+        /* broadcast dimensions get placed in leftmost position */
+        ibroadcast = 0; 
+        for (i = 0; i < ndim_iter; ++i) {
+            label = iter_labels[i];
+            /*
+             * If it's an unlabeled broadcast dimension, choose
+             * the next broadcast dimension from the operand.
+             */
+            if (label == 0) {
+                while (ibroadcast < ndim && labels[ibroadcast] != 0) {
+                    ++ibroadcast;
+                }
+                /*
+                 * If we used up all the operand broadcast dimensions,
+                 * extend it with a "newaxis"
+                 */
+                if (ibroadcast >= ndim) {
+                    axes[i] = -1;
+                }
+                /* Otherwise map to the broadcast axis */
+                else {
+                    axes[i] = ibroadcast;
+                }
+            }
+            /* It's a labeled dimension, find the matching one */
+            else {
+                char *match = memchr(labels, label, ndim);
+                /* If the op doesn't have the label, broadcast it */
+                if (match == NULL) {
+                    axes[i] = -1;
+                }
+                /* Otherwise use it */
+                else {
+                    axes[i] = match - labels;
+                }
+            }
+        }
+    }
+    /* Middle broadcasting */
+    else {
+        /* broadcast dimensions get placed in leftmost position */
+        ibroadcast = 0; 
+        for (i = 0; i < ndim_iter; ++i) {
+            label = iter_labels[i];
+            /*
+             * If it's an unlabeled broadcast dimension, choose
+             * the next broadcast dimension from the operand.
+             */
+            if (label == 0) {
+                while (ibroadcast < ndim && labels[ibroadcast] != 0) {
+                    ++ibroadcast;
+                }
+                /*
+                 * If we used up all the operand broadcast dimensions,
+                 * it's an error
+                 */
+                if (ibroadcast >= ndim) {
+                    PyErr_Format(PyExc_ValueError,
+                            "operand %d did not have enough dimensions "
+                            "to match the broadcasting, and couldn't be "
+                            "extended because einstein sum subscripts "
+                            "were specified at both the start and end",
+                            iop);
+                    return 0;
+                }
+                /* Otherwise map to the broadcast axis */
+                else {
+                    axes[i] = ibroadcast;
+                }
+            }
+            /* It's a labeled dimension, find the matching one */
+            else {
+                char *match = memchr(labels, label, ndim);
+                /* If the op doesn't have the label, broadcast it */
+                if (match == NULL) {
+                    axes[i] = -1;
+                }
+                /* Otherwise use it */
+                else {
+                    axes[i] = match - labels;
+                }
+            }
+        }
+    }
+
+    return 1;
+}
+
+
 /*NUMPY_API
  * This function provides summation of array elements according to
  * the Einstein summation convention.  For example:
@@ -312,9 +680,11 @@ parse_output_subscripts(char *subscripts, int length,
  * then c[i,j] = a[j]*b[i].
  *
  * Alternatively, you can control the output order or prevent
- * an axis from being summed by providing indices for the output.
- * This allows us to turn 'trace' into 'diag', for example.
+ * an axis from being summed/force an axis to be summed by providing
+ * indices for the output. This allows us to turn 'trace' into
+ * 'diag', for example.
  *  - diag(a)         -> einsum("ii->i", a)
+ *  - sum(a, axis=0)  -> einsum("i...->", a)
  *
  * Subscripts at the beginning and end may be specified by
  * putting an ellipsis "..." in the middle.  For example,
@@ -337,12 +707,21 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     int iop, label, min_label = 127, max_label = 0, num_labels;
     char label_counts[128];
     char op_labels[NPY_MAXARGS][NPY_MAXDIMS];
-    char output_labels[NPY_MAXDIMS];
-    int idim, ndim_output, ndim_broadcast;
+    char output_labels[NPY_MAXDIMS], *iter_labels;
+    int idim, ndim_output, ndim_broadcast, ndim_iter;
 
-    PyArrayObject *op[NPY_MAXARGS];
+    EINSUM_BROADCAST broadcast[NPY_MAXARGS];
+    PyArrayObject *op[NPY_MAXARGS], *ret = NULL;
+    PyArray_Descr *op_dtypes_array[NPY_MAXARGS], **op_dtypes;
 
-    if (nop > NPY_MAXARGS) {
+    npy_intp op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
+    npy_intp *op_axes[NPY_MAXARGS];
+    npy_uint32 op_flags[NPY_MAXARGS];
+
+    NpyIter *iter;
+
+    /* nop+1 (+1 is for the output) must fit in NPY_MAXARGS */
+    if (nop >= NPY_MAXARGS) {
         PyErr_SetString(PyExc_ValueError,
                     "too many operands provided to einstein sum function");
         return NULL;
@@ -376,7 +755,8 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         if (!parse_operand_subscripts(subscripts, length,
                         PyArray_NDIM(op_in[iop]),
                         iop, op_labels[iop], label_counts,
-                        &min_label, &max_label, &num_labels)) {
+                        &min_label, &max_label, &num_labels,
+                        &broadcast[iop])) {
             return NULL;
         }
         
@@ -433,7 +813,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         /* Parse the output subscript string */
         ndim_output = parse_output_subscripts(outsubscripts, length,
                                         ndim_broadcast, label_counts,
-                                        output_labels);
+                                        output_labels, &broadcast[nop]);
     }
     else {
         if (subscripts[0] != '-' || subscripts[1] != '>') {
@@ -447,12 +827,20 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         /* Parse the output subscript string */
         ndim_output = parse_output_subscripts(subscripts, strlen(subscripts),
                                         ndim_broadcast, label_counts,
-                                        output_labels);
+                                        output_labels, &broadcast[nop]);
     }
     if (ndim_output < 0) {
         return NULL;
     }
 
+    if (out != NULL && PyArray_NDIM(out) != ndim_output) {
+        PyErr_Format(PyExc_ValueError,
+                "out parameter does not have the correct number of "
+                "dimensions, has %d but should have %d",
+                (int)PyArray_NDIM(out), (int)ndim_output);
+        return NULL;
+    }
+
     /* Set all the op references to NULL */
     for (iop = 0; iop < nop; ++iop) {
         op[iop] = NULL;
@@ -464,8 +852,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
      */
     printf("Processing inputs\n");
     for (iop = 0; iop < nop; ++iop) {
-        npy_intp new_strides[NPY_MAXDIMS];
-        npy_intp new_dims[NPY_MAXDIMS];
         char *labels = op_labels[iop];
         int combine, ndim;
 
@@ -476,86 +862,16 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
          * first try remapping the axes to the output to return
          * a view instead of a copy.
          */
-        if (nop == 1 && out == NULL) {
-            char *out_label;
-            int i, ibroadcast = 0;
-
-            /* Initialize the dimensions and strides to zero */
-            for (idim = 0; idim < ndim_output; ++idim) {
-                new_dims[idim] = 0;
-                new_strides[idim] = 0;
-            }
-            /* Match the labels in the operand with the output labels */
-            for (idim = 0; idim < ndim; ++idim) {
-                printf("Matching label for dimension %d\n", idim);
-                label = labels[idim];
-                /* If this label says to merge axes, get the actual label */
-                if (label < 0) {
-                    label = labels[idim+label];
-                }
-                /* If the label is 0, it's an unlabeled broadcast dimension */
-                if (label == 0) {
-                    /* The next output label that's a broadcast dimension */
-                    for (; ibroadcast < ndim_output; ++ibroadcast) {
-                        if (output_labels[ibroadcast] == 0) {
-                            break;
-                        }
-                    }
-                    if (ibroadcast == ndim_output) {
-                        PyErr_SetString(PyExc_ValueError,
-                                "output had too few broadcast dimensions");
-                        return NULL;
-                    }
-                    new_dims[ibroadcast] = PyArray_DIM(op_in[iop], idim);
-                    new_strides[ibroadcast] = PyArray_STRIDE(op_in[iop], idim);
-                    ++ibroadcast;
-                }
-                else {
-                    /* Find the position for this dimension in the output */
-                    out_label = (char *)memchr(output_labels, label,
-                                                            ndim_output);
-                    /* If it's not found, reduction -> can't return a view */
-                    if (out_label == NULL) {
-                        break;
-                    }
-                    /* Update the dimensions and strides of the output */
-                    i = out_label - output_labels;
-                    if (new_dims[i] != 0 &&
-                            new_dims[i] != PyArray_DIM(op_in[iop], idim)) {
-                        PyErr_Format(PyExc_ValueError,
-                                "dimensions in operand %d for collapsing "
-                                "index '%c' don't match (%d != %d)",
-                                iop, label, (int)new_dims[i],
-                                (int)PyArray_DIM(op_in[iop], idim));
-                        return NULL;
-                    }
-                    new_dims[i] = PyArray_DIM(op_in[iop], idim);
-                    new_strides[i] += PyArray_STRIDE(op_in[iop], idim);
-                }
+        if (iop == 0 && nop == 1 && out == NULL) {
+            PyArrayObject *ret = NULL;
+
+            if (!get_single_op_view(op_in[iop], iop, labels,
+                                    ndim_output, output_labels,
+                                    &ret)) {
+                return NULL;
             }
-            /* If we processed all the input axes, return a view */
-            if (idim == ndim) {
-                printf("Returning a view\n");
-                Py_INCREF(PyArray_DESCR(op_in[iop]));
-                op[iop] = (PyArrayObject *)PyArray_NewFromDescr(
-                                        Py_TYPE(op_in[iop]),
-                                        PyArray_DESCR(op_in[iop]),
-                                        ndim_output, new_dims, new_strides,
-                                        PyArray_DATA(op_in[iop]),
-                                        0, (PyObject *)op_in[iop]);
-
-                if (op[iop] == NULL) {
-                    return NULL;
-                }
-                if (!PyArray_Check(op[iop])) {
-                    Py_DECREF(op[iop]);
-                    PyErr_SetString(PyExc_RuntimeError,
-                                "NewFromDescr failed to return an array");
-                    return NULL;
-                }
-                Py_INCREF(op_in[iop]);
-                PyArray_BASE(op[iop]) = (PyObject *)op_in[iop];
-                return op[iop];
+            if (ret != NULL) {
+                return ret;
             }
         }
 
@@ -567,86 +883,115 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
             }
         }
 
-        PyErr_SetString(PyExc_RuntimeError, "not implemented yet");
-        return NULL;
+        /* If any dimensions are combined, create a view which combines them */
+        if (combine) {
+            op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
+            if (op[iop] == NULL) {
+                goto fail;
+            }
+        }
+        /* No combining needed */
+        else {
+            Py_INCREF(op_in[iop]);
+            op[iop] = op_in[iop];
+        }
     }
 
-    PyErr_SetString(PyExc_RuntimeError, "not implemented yet");
-    return NULL;
+    /* Set the output op */
+    op[nop] = out;
 
-#if 0
-        /* Copy the dimensions before the labels without changes */
-        for (idim = 0; idim < ndim-nlabels; ++i) {
-            new_strides[idim] = PyArray_STRIDE(op[iop], idim);
-            new_dims[idim] = PyArray_DIM(op[iop], idim);
-        }
-        /* Process the labels for the rest */
-        for (ilabels = 0; ilabels < nlabels; ++ilabels) {
-            label = labels[ilabels];
-            if (label_counts[label] > 1) {
-                /* If this label occurs earlier, merge it */
-                for (jlabels = 0; jlabels < ilabels; ++jlabels) {
-                    if (labels[jlabels] == label) {
-                        if (new_dims[jlabels+ndims-nlabels] !=
-                                PyArray_DIM(op[iop], ilabels+ndims-nlabels)) {
-                            PyErr_Format(PyExc_ValueError,
-                                    "operand %d has a repeated label, but "
-                                    "the dimensions for the label don't match",
-                                    (int)iop);
-                            goto fail;
-                        }
-                        new_strides[jlabels+ndims-nlabels] +=
-                                PyArray_STRIDE(op[iop], ilabels+ndims-nlabels);
-                        /* Indicate this label should be removed */
-                        labels[ilabels] = 0;
-                        break;
-                    }
-                }
-            }
-        }
-        /* Collapse the labels that got cleared */
-        jlabels = 0;
-        for (ilabels = 0; ilabels < nlabels; ++ilabels) {
-            if (labels[ilabels] != 0) {
-                if (jlabels != ilabels) {
-                    labels[jlabels] = labels[ilabels];
-                    i = ilabels+ndims-nlabels;
-                    new_strides[idim] = new_strides[i];
-                    new_dims[idim] = new_dims[i];
-                }
-                idim++;
-                jlabels++;
-            }
-            else {
-                op_nlabels[iop]--;
-            }
-        }
-        /* If the dimensions shrunk, make a new view */
-        if (idim != ndim) {
-            ndim = idim;
-            Py_INCREF(PyArray_DESCR(op_in[iop]));
-            op[iop] = PyArray_NewFromDescr(&PyArray_Type,
-                                        PyArray_DESCR(op_in[iop]),
-                                        ndim, new_dims, new_strides,
-                                        PyArray_DATA(op_in[iop]),
-                                        0, NULL);
-            if (op[iop] == NULL) {
+    /*
+     * Set up the labels for the iterator (output + combined labels).
+     * Can just share the output_labels memory, because iter_labels
+     * is output_labels with some more labels appended.
+     */
+    iter_labels = output_labels;
+    ndim_iter = ndim_output;
+    for (label = min_label; label <= max_label; ++label) {
+        if (label_counts[label] > 0 &&
+                memchr(output_labels, label, ndim_output) == NULL) {
+            if (ndim_iter >= NPY_MAXDIMS) {
+                PyErr_SetString(PyExc_ValueError,
+                            "too many subscripts in einsum");
                 goto fail;
             }
-            Py_INCREF(op_in[iop]);
-            PyArray_BASE(op[iop]) = (PyObject *)op_in[iop];
+            iter_labels[ndim_iter++] = label;
         }
-        /* Otherwise just take a reference */
-        else {
-            op[iop] = op_in[iop]
-            Py_INCREF(op[iop]);
+    }
+
+    /* Set up the op_axes for the iterator */
+    for (iop = 0; iop < nop; ++iop) {
+        op_axes[iop] = op_axes_arrays[iop];
+
+        if (!prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
+                    op_axes[iop], ndim_iter, iter_labels, broadcast[iop])) {
+            goto fail;
         }
     }
-goto fail;
+
+    /* Set up the op_dtypes if dtype was provided */
+    if (dtype == NULL) {
+        op_dtypes = NULL;
+    }
+    else {
+        op_dtypes = op_dtypes_array;
+        for (iop = 0; iop <= nop; ++iop) {
+            op_dtypes[iop] = dtype;
+        }
+    }
+
+    /* Set the op_axes for the output */
+    op_axes[nop] = op_axes_arrays[nop];
+    for (idim = 0; idim < ndim_output; ++idim) {
+        op_axes[nop][idim] = idim;
+    }
+    for (idim = ndim_output; idim < ndim_iter; ++idim) {
+        op_axes[nop][idim] = -1;
+    }
+
+    /* Set the iterator per-op flags */
+
+    for (iop = 0; iop < nop; ++iop) {
+        op_flags[iop] = NPY_ITER_READONLY|
+                        NPY_ITER_NBO|
+                        NPY_ITER_ALIGNED;
+    }
+    op_flags[nop] = NPY_ITER_READWRITE|
+                    NPY_ITER_NBO|
+                    NPY_ITER_ALIGNED|
+                    NPY_ITER_ALLOCATE|
+                    NPY_ITER_NO_BROADCAST;
+
+    /* Allocate the iterator */
+    iter = NpyIter_MultiNew(nop+1, op, NPY_ITER_NO_INNER_ITERATION|
+                ((dtype == NULL) ? 0 : NPY_ITER_COMMON_DTYPE)|
+                                       NPY_ITER_BUFFERED|
+                                       NPY_ITER_DELAY_BUFALLOC|
+                                       NPY_ITER_GROWINNER|
+                                       NPY_ITER_REDUCE_OK,
+                                       order, casting,
+                                       op_flags, op_dtypes,
+                                       ndim_iter, op_axes, 0);
+    
+    if (iter == NULL) {
+        goto fail;
+    }
+
+    /* Initialize the output to all zeros */
+    ret = NpyIter_GetOperandArray(iter)[nop];
+    Py_INCREF(ret);
+    PyArray_FillWithZero(ret);
+    NpyIter_Reset(iter, NULL);
+
+    NpyIter_DebugPrint(iter);
+    NpyIter_Deallocate(iter);
+
+    return ret;
+
+fail:
     for (iop = 0; iop < nop; ++iop) {
         Py_XDECREF(op[iop]);
     }
 
     return NULL;
-#endif
 }
diff --git a/numpy/core/src/multiarray/new_iterator.c.src b/numpy/core/src/multiarray/new_iterator.c.src
index 78725f8c9..30ca459ab 100644
--- a/numpy/core/src/multiarray/new_iterator.c.src
+++ b/numpy/core/src/multiarray/new_iterator.c.src
@@ -18,7 +18,7 @@
 #include "lowlevel_strided_loops.h"
 
 /********** PRINTF DEBUG TRACING **************/
-#define NPY_IT_DBG_TRACING 0
+#define NPY_IT_DBG_TRACING 1
 
 #if NPY_IT_DBG_TRACING
 #define NPY_IT_DBG_PRINTF(...) printf(__VA_ARGS__)
@@ -2555,6 +2555,8 @@ npyiter_prepare_one_operand(PyArrayObject **op,
                 Py_DECREF(*op_dtype);
                 *op_dtype = nbo_dtype;
 
+                NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                    "because of NPY_ITER_NBO\n");
                 /* Indicate that byte order or alignment needs fixing */
                 *op_itflags |= NPY_OP_ITFLAG_CAST;
             }
@@ -2563,6 +2565,8 @@ npyiter_prepare_one_operand(PyArrayObject **op,
         if (op_flags&NPY_ITER_ALIGNED) {
             /* Check alignment */
             if (!PyArray_ISALIGNED(*op)) {
+                NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                    "because of NPY_ITER_ALIGNED\n");
                 *op_itflags |= NPY_OP_ITFLAG_CAST;
             }
         }
@@ -2709,6 +2713,8 @@ npyiter_check_casting(npy_intp niter, PyArrayObject **op,
                 return 0;
             }
 
+            NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                "because the types aren't equivalent\n");
             /* Indicate that this operand needs casting */
             op_itflags[iiter] |= NPY_OP_ITFLAG_CAST;
         }
@@ -3784,6 +3790,9 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
         }
 
         if (i >= 0) {
+            NPY_IT_DBG_PRINTF("Iterator: Setting allocated stride %d "
+                                "for iterator dimension %d to %d\n", (int)i,
+                                (int)idim, (int)stride);
             strides[i] = stride;
             if (p < 0) {
                 reversestride[i] = 1;
@@ -3808,7 +3817,13 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
 
         axisdata = NIT_AXISDATA(iter);
         for (idim = 0; idim < op_ndim; ++idim) {
+            NPY_IT_DBG_PRINTF("Iterator: Checking allocated output "
+                              "dimension %d with stride %d\n",
+                              (int)idim, (int)strides[idim]);
             if (strides[idim] == NPY_MAX_INTP) {
+                NPY_IT_DBG_PRINTF("Iterator: The axis wasn't used, "
+                                  "and its dimension is %d\n",
+                                  (int)NAD_SHAPE(axisdata));
                 /*
                  * If deleting this axis produces a reduction, but
                  * reduction wasn't enabled, throw an error
@@ -3827,6 +3842,8 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
                         return NULL;
                     }
 
+                    NPY_IT_DBG_PRINTF("Iterator: Indicating that a reduction "
+                                      "is occurring\n");
                     /* Indicate that a reduction is occurring */
                     NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
                 }
@@ -4067,6 +4084,8 @@ npyiter_allocate_arrays(NpyIter *iter,
             npy_intp stride = NAD_STRIDES(axisdata)[iiter];
 
             if (stride != op_dtype[iiter]->elsize) {
+                NPY_IT_DBG_PRINTF("Iterator: Setting NPY_OP_ITFLAG_CAST "
+                                    "because of NPY_ITER_CONTIG\n");
                 op_itflags[iiter] |= NPY_OP_ITFLAG_CAST;
                 if (!(itflags&NPY_ITFLAG_BUFFER)) {
                     PyErr_SetString(PyExc_TypeError,
@@ -4142,7 +4161,9 @@ npyiter_allocate_arrays(NpyIter *iter,
             }
             else if (!is_one_to_one &&
                         (op_itflags[iiter]&NPY_OP_ITFLAG_WRITE) &&
-                        !(itflags&NPY_ITFLAG_REDUCE)) {
+                        !(flags&NPY_ITER_REDUCE_OK)) {
+            NPY_IT_DBG_PRINTF("Iterator: %d %d %d\n",
+                            (int)(!is_one_to_one), (int)((op_itflags[iiter]&NPY_OP_ITFLAG_WRITE)), (int)(!(flags&NPY_ITER_REDUCE_OK)));
                 PyErr_SetString(PyExc_ValueError,
                         "Iterator operand requires write buffering, "
                         "but has dimensions which have been broadcasted "
@@ -4856,6 +4877,8 @@ NpyIter_DebugPrint(NpyIter *iter)
         printf("DELAYBUF ");
     if (itflags&NPY_ITFLAG_NEEDSAPI)
         printf("NEEDSAPI ");
+    if (itflags&NPY_ITFLAG_REDUCE)
+        printf("REDUCE ");
     printf("\n");
     printf("NDim: %d\n", (int)ndim);
     printf("NIter: %d\n", (int)niter);
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 5937c6e9a..62f10693c 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -208,6 +208,7 @@ class TestEinSum(TestCase):
         assert_raises(ValueError, np.einsum, "ij->jij", [[0,0],[0,0]])
 
         # dimensions much match when being collapsed
+        assert_raises(ValueError, np.einsum, "ii", np.arange(6).reshape(2,3))
         assert_raises(ValueError, np.einsum, "ii->i", np.arange(6).reshape(2,3))
 
     def test_einsum_views(self):