21 files changed, 699 insertions, 356 deletions
diff --git a/.travis.yml b/.travis.yml
index 4632fbffe..6b010e58f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,11 +39,13 @@ python:
 matrix:
   include:
     - python: 3.6
-      env: USE_CHROOT=1 ARCH=i386 DIST=artful PYTHON=3.6
+      env: USE_CHROOT=1 ARCH=i386 DIST=bionic PYTHON=3.6
       sudo: true
       addons:
         apt:
+          update: true
           packages:
+            - dpkg
             - debootstrap
     - python: 3.4
       env: USE_DEBUG=1
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
index 23e0284c4..5f72149e9 100644
--- a/doc/release/1.15.0-notes.rst
+++ b/doc/release/1.15.0-notes.rst
@@ -153,6 +153,12 @@ C API changes
   checked before the operation whose status we wanted to check was run.
   See `#10339 <https://github.com/numpy/numpy/issues/10370>`__.
 
+* ``PyArray_GetDTypeTransferFunction`` now defaults to using user-defined
+  ``copyswapn`` / ``copyswap`` for user-defined dtypes. If this causes a
+  significant performance hit, consider implementing ``copyswapn`` to reflect
+  the implementation of ``PyArray_GetStridedCopyFn``.
+  See `#10898 <https://github.com/numpy/numpy/pull/10898>`__.
+
 New Features
 ============
 
@@ -369,5 +375,22 @@ inner-product example, ``keepdims=True, axes=[-2, -2, -2]`` would act on the
 one-but-last dimension of the input arguments, and leave a size 1 dimension in
 that place in the output.
 
+New ``np.take_along_axis`` and ``np.put_along_axis`` functions
+--------------------------------------------------------------
+When used on multidimensional arrays, ``argsort``, ``argmin``, ``argmax``, and
+``argpartition`` return arrays that are difficult to use as indices.
+``take_along_axis`` provides an easy way to use these indices to lookup values
+within an array, so that::
+
+    np.take_along_axis(a, np.argsort(a, axis=axis), axis=axis)
+
+is the same as::
+
+    np.sort(a, axis=axis)
+
+``np.put_along_axis`` acts as the dual operation for writing to these indices
+within an array.
+
+
 Changes
 =======
diff --git a/doc/source/reference/routines.indexing.rst b/doc/source/reference/routines.indexing.rst
index 4d2458d2f..aeec1a1bb 100644
--- a/doc/source/reference/routines.indexing.rst
+++ b/doc/source/reference/routines.indexing.rst
@@ -36,6 +36,7 @@ Indexing-like operations
    :toctree: generated/
 
    take
+   take_along_axis
    choose
    compress
    diag
@@ -50,6 +51,7 @@ Inserting data into arrays
 
    place
    put
+   put_along_axis
    putmask
    fill_diagonal
 
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 0db5663f9..d1aae0aa0 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -140,6 +140,7 @@ def take(a, indices, axis=None, out=None, mode='raise'):
     --------
     compress : Take elements using a boolean mask
     ndarray.take : equivalent method
+    take_along_axis : Take elements by matching the array and the index arrays
 
     Notes
     -----
@@ -478,6 +479,7 @@ def put(a, ind, v, mode='raise'):
     See Also
     --------
     putmask, place
+    put_along_axis : Put elements by matching the array and the index arrays
 
     Examples
     --------
@@ -723,7 +725,9 @@ def argpartition(a, kth, axis=-1, kind='introselect', order=None):
     -------
     index_array : ndarray, int
         Array of indices that partition `a` along the specified axis.
-        In other words, ``a[index_array]`` yields a partitioned `a`.
+        If `a` is one-dimensional, ``a[index_array]`` yields a partitioned `a`.
+        More generally, ``np.take_along_axis(a, index_array, axis=a)`` always
+        yields the partitioned `a`, irrespective of dimensionality.
 
     See Also
     --------
@@ -904,6 +908,8 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
     index_array : ndarray, int
         Array of indices that sort `a` along the specified axis.
         If `a` is one-dimensional, ``a[index_array]`` yields a sorted `a`.
+        More generally, ``np.take_along_axis(a, index_array, axis=a)`` always
+        yields the sorted `a`, irrespective of dimensionality.
 
     See Also
     --------
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 42f876125..972147bb0 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -182,6 +182,15 @@ npy_strtoull(const char *str, char **endptr, int base)
  *****************************************************************************
  */
 
+#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+/*
+ * Disable harmless compiler warning "4116: unnamed type definition in
+ * parentheses" which is caused by the _ALIGN macro.
+ */
+#if defined(_MSC_VER)
+#pragma warning(disable:4116)
+#endif
+
 
 /**begin repeat
  *
@@ -246,8 +255,10 @@ static int
         }
         return -1;
     }
-    if (ap == NULL || PyArray_ISBEHAVED(ap))
+    if (ap == NULL || PyArray_ISBEHAVED(ap)) {
+        assert(npy_is_aligned(ov, _ALIGN(@type@)));
         *((@type@ *)ov)=temp;
+    }
     else {
         PyArray_DESCR(ap)->f->copyswap(ov, &temp, PyArray_ISBYTESWAPPED(ap),
                                        ap);
@@ -746,7 +757,7 @@ NPY_NO_EXPORT int PyArray_CopyObject(PyArrayObject *, PyObject *);
  */
 NPY_NO_EXPORT int
 _setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr,
-            npy_intp *offset_p)
+            npy_intp *offset_p, char *dstdata)
 {
     PyObject *key;
     PyObject *tup;
@@ -760,7 +771,8 @@ _setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr,
     }
 
     ((PyArrayObject_fields *)(arr))->descr = new;
-    if ((new->alignment > 1) && ((offset % new->alignment) != 0)) {
+    if ((new->alignment > 1) && 
+                ((((uintptr_t)dstdata + offset) % new->alignment) != 0)) {
         PyArray_CLEARFLAGS(arr, NPY_ARRAY_ALIGNED);
     }
     else {
@@ -788,7 +800,7 @@ _copy_and_return_void_setitem(PyArray_Descr *dstdescr, char *dstdata,
     if (PyArray_EquivTypes(srcdescr, dstdescr)) {
         for (i = 0; i < names_size; i++) {
             /* neither line can ever fail, in principle */
-            if (_setup_field(i, dstdescr, dummy, &offset)) {
+            if (_setup_field(i, dstdescr, dummy, &offset, dstdata)) {
                 return -1;
             }
             PyArray_DESCR(dummy)->f->copyswap(dstdata + offset,
@@ -858,7 +870,7 @@ VOID_setitem(PyObject *op, void *input, void *vap)
                 PyObject *item;
 
                 /* temporarily make ap have only this field */
-                if (_setup_field(i, descr, ap, &offset) == -1) {
+                if (_setup_field(i, descr, ap, &offset, ip) == -1) {
                     failed = 1;
                     break;
                 }
@@ -880,7 +892,7 @@ VOID_setitem(PyObject *op, void *input, void *vap)
 
             for (i = 0; i < names_size; i++) {
                 /* temporarily make ap have only this field */
-                if (_setup_field(i, descr, ap, &offset) == -1) {
+                if (_setup_field(i, descr, ap, &offset, ip) == -1) {
                     failed = 1;
                     break;
                 }
@@ -4193,17 +4205,6 @@ small_correlate(const char * d_, npy_intp dstride,
  *****************************************************************************
  */
 
-
-#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
-/*
- * Disable harmless compiler warning "4116: unnamed type definition in
- * parentheses" which is caused by the _ALIGN macro.
- */
-#if defined(_MSC_VER)
-#pragma warning(disable:4116)
-#endif
-
-
 /**begin repeat
  *
  * #from = VOID, STRING, UNICODE#
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 9c27255aa..9f9aa6757 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -3400,6 +3400,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
 {
     npy_intp src_itemsize, dst_itemsize;
     int src_type_num, dst_type_num;
+    int is_builtin;
 
 #if NPY_DT_DBG_TRACING
     printf("Calculating dtype transfer from ");
@@ -3439,6 +3440,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
     dst_itemsize = dst_dtype->elsize;
     src_type_num = src_dtype->type_num;
     dst_type_num = dst_dtype->type_num;
+    is_builtin = src_type_num < NPY_NTYPES && dst_type_num < NPY_NTYPES;
 
     /* Common special case - number -> number NBO cast */
     if (PyTypeNum_ISNUMBER(src_type_num) &&
@@ -3462,13 +3464,14 @@ PyArray_GetDTypeTransferFunction(int aligned,
     }
 
     /*
-     * If there are no references and the data types are equivalent,
+     * If there are no references and the data types are equivalent and builtin,
      * return a simple copy
      */
     if (PyArray_EquivTypes(src_dtype, dst_dtype) &&
             !PyDataType_REFCHK(src_dtype) && !PyDataType_REFCHK(dst_dtype) &&
             ( !PyDataType_HASFIELDS(dst_dtype) ||
-              is_dtype_struct_simple_unaligned_layout(dst_dtype)) ) {
+              is_dtype_struct_simple_unaligned_layout(dst_dtype)) &&
+            is_builtin) {
         /*
          * We can't pass through the aligned flag because it's not
          * appropriate. Consider a size-8 string, it will say it's
@@ -3494,7 +3497,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
                 !PyDataType_HASSUBARRAY(dst_dtype) &&
                 src_type_num != NPY_DATETIME && src_type_num != NPY_TIMEDELTA) {
         /* A custom data type requires that we use its copy/swap */
-        if (src_type_num >= NPY_NTYPES || dst_type_num >= NPY_NTYPES) {
+        if (!is_builtin) {
             /*
              * If the sizes and kinds are identical, but they're different
              * custom types, then get a cast function
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 470a5fff9..0eab25299 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -1776,138 +1776,94 @@ get_sum_of_products_function(int nop, int type_num,
     return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
 }
 
+
 /*
- * Parses the subscripts for one operand into an output
- * of 'ndim' labels
+ * Parses the subscripts for one operand into an output of 'ndim'
+ * labels. The resulting 'op_labels' array will have:
+ *  - the ASCII code of the label for the first occurrence of a label;
+ *  - the (negative) offset to the first occurrence of the label for
+ *    repeated labels;
+ *  - zero for broadcast dimensions, if subscripts has an ellipsis.
+ * For example:
+ *  - subscripts="abbcbc",  ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2]
+ *  - subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99]
  */
+
 static int
 parse_operand_subscripts(char *subscripts, int length,
-                        int ndim,
-                        int iop, char *out_labels,
-                        char *out_label_counts,
-                        int *out_min_label,
-                        int *out_max_label,
-                        int *out_num_labels)
+                         int ndim, int iop, char *op_labels,
+                         char *label_counts, int *min_label, int *max_label)
 {
-    int i, idim, ndim_left, label;
-    int ellipsis = 0;
+    int i;
+    int idim = 0;
+    int ellipsis = -1;
 
-    /* Process the labels from the end until the ellipsis */
-    idim = ndim-1;
-    for (i = length-1; i >= 0; --i) {
-        label = subscripts[i];
-        /* A label for an axis */
+    /* Process all labels for this operand */
+    for (i = 0; i < length; ++i) {
+        int label = subscripts[i];
+
+        /* A proper label for an axis. */
         if (label > 0 && isalpha(label)) {
-            if (idim >= 0) {
-                out_labels[idim--] = label;
-                /* Calculate the min and max labels */
-                if (label < *out_min_label) {
-                    *out_min_label = label;
-                }
-                if (label > *out_max_label) {
-                    *out_max_label = label;
-                }
-                /* If it's the first time we see this label, count it */
-                if (out_label_counts[label] == 0) {
-                    (*out_num_labels)++;
-                }
-                out_label_counts[label]++;
-            }
-            else {
+            /* Check we don't exceed the operator dimensions. */
+            if (idim >= ndim) {
                 PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains "
-                            "too many subscripts for operand %d", iop);
-                return 0;
+                             "einstein sum subscripts string contains "
+                             "too many subscripts for operand %d", iop);
+                return -1;
+            }
+
+            op_labels[idim++] = label;
+            if (label < *min_label) {
+                *min_label = label;
             }
+            if (label > *max_label) {
+                *max_label = label;
+            }
+            label_counts[label]++;
         }
-        /* The end of the ellipsis */
+        /* The beginning of the ellipsis. */
         else if (label == '.') {
-            /* A valid ellipsis */
-            if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') {
-                ellipsis = 1;
-                length = i-2;
-                break;
-            }
-            else {
+            /* Check it's a proper ellipsis. */
+            if (ellipsis != -1 || i + 2 >= length
+                    || subscripts[++i] != '.' || subscripts[++i] != '.') {
                 PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') in "
-                            "operand %d", iop);
-                return 0;
-
+                             "einstein sum subscripts string contains a "
+                             "'.' that is not part of an ellipsis ('...') "
+                             "in operand %d", iop);
+                return -1;
             }
+
+            ellipsis = idim;
         }
         else if (label != ' ') {
             PyErr_Format(PyExc_ValueError,
-                        "invalid subscript '%c' in einstein sum "
-                        "subscripts string, subscripts must "
-                        "be letters", (char)label);
-            return 0;
+                         "invalid subscript '%c' in einstein sum "
+                         "subscripts string, subscripts must "
+                         "be letters", (char)label);
+            return -1;
         }
     }
 
-    if (!ellipsis && idim != -1) {
-        PyErr_Format(PyExc_ValueError,
-                    "operand has more dimensions than subscripts "
-                    "given in einstein sum, but no '...' ellipsis "
-                    "provided to broadcast the extra dimensions.");
-        return 0;
-    }
-
-    /* Reduce ndim to just the dimensions left to fill at the beginning */
-    ndim_left = idim+1;
-    idim = 0;
-
-    /*
-     * If we stopped because of an ellipsis, start again from the beginning.
-     * The length was truncated to end at the ellipsis in this case.
-     */
-    if (i > 0) {
-        for (i = 0; i < length; ++i) {
-            label = subscripts[i];
-            /* A label for an axis */
-            if (label > 0 && isalnum(label)) {
-                if (idim < ndim_left) {
-                    out_labels[idim++] = label;
-                    /* Calculate the min and max labels */
-                    if (label < *out_min_label) {
-                        *out_min_label = label;
-                    }
-                    if (label > *out_max_label) {
-                        *out_max_label = label;
-                    }
-                    /* If it's the first time we see this label, count it */
-                    if (out_label_counts[label] == 0) {
-                        (*out_num_labels)++;
-                    }
-                    out_label_counts[label]++;
-                }
-                else {
-                    PyErr_Format(PyExc_ValueError,
-                                "einstein sum subscripts string contains "
-                                "too many subscripts for operand %d", iop);
-                    return 0;
-                }
-            }
-            else if (label == '.') {
-                PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') in "
-                            "operand %d", iop);
-            }
-            else if (label != ' ') {
-                PyErr_Format(PyExc_ValueError,
-                            "invalid subscript '%c' in einstein sum "
-                            "subscripts string, subscripts must "
-                            "be letters", (char)label);
-                return 0;
-            }
+    /* No ellipsis found, labels must match dimensions exactly. */
+    if (ellipsis == -1) {
+        if (idim != ndim) {
+            PyErr_Format(PyExc_ValueError,
+                         "operand has more dimensions than subscripts "
+                         "given in einstein sum, but no '...' ellipsis "
+                         "provided to broadcast the extra dimensions.");
+            return -1;
         }
     }
-
-    /* Set the remaining labels to 0 */
-    while (idim < ndim_left) {
-        out_labels[idim++] = 0;
+    /* Ellipsis found, may have to add broadcast dimensions. */
+    else if (idim < ndim) {
+        /* Move labels after ellipsis to the end. */
+        for (i = 0; i < idim - ellipsis; ++i) {
+            op_labels[ndim - i - 1] = op_labels[idim - i - 1];
+        }
+        /* Set all broadcast dimensions to zero. */
+        for (i = 0; i < ndim - idim; ++i) {
+            op_labels[ellipsis + i] = 0;
+        }
     }
 
     /*
@@ -1918,158 +1874,116 @@ parse_operand_subscripts(char *subscripts, int length,
      * twos complement arithmetic the char is ok either way here, and
      * later where it matters the char is cast to a signed char.
      */
-    for (idim = 0; idim  < ndim-1; ++idim) {
-        char *next;
-        /* If this is a proper label, find any duplicates of it */
-        label = out_labels[idim];
+    for (idim = 0; idim < ndim - 1; ++idim) {
+        int label = op_labels[idim];
+        /* If it is a proper label, find any duplicates of it. */
         if (label > 0) {
-            /* Search for the next matching label */
-            next = (char *)memchr(out_labels+idim+1, label,
-                                    ndim-idim-1);
+            /* Search for the next matching label. */
+            char *next = memchr(op_labels + idim + 1, label, ndim - idim - 1);
+
             while (next != NULL) {
-                /* The offset from next to out_labels[idim] (negative) */
-                *next = (char)((out_labels+idim)-next);
-                /* Search for the next matching label */
-                next = (char *)memchr(next+1, label,
-                                        out_labels+ndim-1-next);
+                /* The offset from next to op_labels[idim] (negative). */
+                *next = (char)((op_labels + idim) - next);
+                /* Search for the next matching label. */
+                next = memchr(next + 1, label, op_labels + ndim - 1 - next);
             }
         }
     }
 
-    return 1;
+    return 0;
 }
 
+
 /*
- * Parses the subscripts for the output operand into an output
- * that requires 'ndim_broadcast' unlabeled dimensions, returning
- * the number of output dimensions.  Returns -1 if there is an error.
+ * Parses the subscripts for the output operand into an output that
+ * includes 'ndim_broadcast' unlabeled dimensions, and returns the total
+ * number of output dimensions, or -1 if there is an error. Similarly
+ * to parse_operand_subscripts, the 'out_labels' array will have, for
+ * each dimension:
+ *  - the ASCII code of the corresponding label;
+ *  - zero for broadcast dimensions, if subscripts has an ellipsis.
  */
 static int
 parse_output_subscripts(char *subscripts, int length,
                         int ndim_broadcast,
-                        const char *label_counts,
-                        char *out_labels)
+                        const char *label_counts, char *out_labels)
 {
-    int i, nlabels, label, idim, ndim, ndim_left;
+    int i, bdim;
+    int ndim = 0;
     int ellipsis = 0;
 
-    /* Count the labels, making sure they're all unique and valid */
-    nlabels = 0;
+    /* Process all the output labels. */
     for (i = 0; i < length; ++i) {
-        label = subscripts[i];
-        if (label > 0 && isalpha(label)) {
-            /* Check if it occurs again */
-            if (memchr(subscripts+i+1, label, length-i-1) == NULL) {
-                /* Check that it was used in the inputs */
-                if (label_counts[label] == 0) {
-                    PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string included "
-                            "output subscript '%c' which never appeared "
-                            "in an input", (char)label);
-                    return -1;
-                }
+        int label = subscripts[i];
 
-                nlabels++;
-            }
-            else {
+        /* A proper label for an axis. */
+        if (label > 0 && isalpha(label)) {
+            /* Check that it doesn't occur again. */
+            if (memchr(subscripts + i + 1, label, length - i - 1) != NULL) {
                 PyErr_Format(PyExc_ValueError,
-                        "einstein sum subscripts string includes "
-                        "output subscript '%c' multiple times",
-                        (char)label);
+                             "einstein sum subscripts string includes "
+                             "output subscript '%c' multiple times",
+                             (char)label);
                 return -1;
             }
-        }
-        else if (label != '.' && label != ' ') {
-            PyErr_Format(PyExc_ValueError,
-                        "invalid subscript '%c' in einstein sum "
-                        "subscripts string, subscripts must "
-                        "be letters", (char)label);
-            return -1;
-        }
-    }
-
-    /* The number of output dimensions */
-    ndim = ndim_broadcast + nlabels;
-
-    /* Process the labels from the end until the ellipsis */
-    idim = ndim-1;
-    for (i = length-1; i >= 0; --i) {
-        label = subscripts[i];
-        /* A label for an axis */
-        if (label != '.' && label != ' ') {
-            if (idim >= 0) {
-                out_labels[idim--] = label;
+            /* Check that it was used in the inputs. */
+            if (label_counts[label] == 0) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string included "
+                             "output subscript '%c' which never appeared "
+                             "in an input", (char)label);
+                return -1;
             }
-            else {
+            /* Check that there is room in out_labels for this label. */
+            if (ndim >= NPY_MAXDIMS) {
                 PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains "
-                            "too many output subscripts");
+                             "einstein sum subscripts string contains "
+                             "too many subscripts in the output");
                 return -1;
             }
+
+            out_labels[ndim++] = label;
         }
-        /* The end of the ellipsis */
+        /* The beginning of the ellipsis. */
         else if (label == '.') {
-            /* A valid ellipsis */
-            if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') {
-                ellipsis = 1;
-                length = i-2;
-                break;
-            }
-            else {
+            /* Check it is a proper ellipsis. */
+            if (ellipsis || i + 2 >= length
+                    || subscripts[++i] != '.' || subscripts[++i] != '.') {
                 PyErr_SetString(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') "
-                            "in the output");
+                                "einstein sum subscripts string "
+                                "contains a '.' that is not part of "
+                                "an ellipsis ('...') in the output");
                 return -1;
-
             }
-        }
-    }
-
-    if (!ellipsis && idim != -1) {
-        PyErr_SetString(PyExc_ValueError,
-                    "output has more dimensions than subscripts "
-                    "given in einstein sum, but no '...' ellipsis "
-                    "provided to broadcast the extra dimensions.");
-        return 0;
-    }
-
-    /* Reduce ndim to just the dimensions left to fill at the beginning */
-    ndim_left = idim+1;
-    idim = 0;
-
-    /*
-     * If we stopped because of an ellipsis, start again from the beginning.
-     * The length was truncated to end at the ellipsis in this case.
-     */
-    if (i > 0) {
-        for (i = 0; i < length; ++i) {
-            label = subscripts[i];
-            if (label == '.') {
-                PyErr_SetString(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') "
-                            "in the output");
+            /* Check there is room in out_labels for broadcast dims. */
+            if (ndim + ndim_broadcast > NPY_MAXDIMS) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string contains "
+                             "too many subscripts in the output");
                 return -1;
             }
-            /* A label for an axis */
-            else if (label != ' ') {
-                if (idim < ndim_left) {
-                    out_labels[idim++] = label;
-                }
-                else {
-                    PyErr_Format(PyExc_ValueError,
-                                "einstein sum subscripts string contains "
-                                "too many subscripts for the output");
-                    return -1;
-                }
+
+            ellipsis = 1;
+            for (bdim = 0; bdim < ndim_broadcast; ++bdim) {
+                out_labels[ndim++] = 0;
             }
         }
+        else if (label != ' ') {
+            PyErr_Format(PyExc_ValueError,
+                         "invalid subscript '%c' in einstein sum "
+                         "subscripts string, subscripts must "
+                         "be letters", (char)label);
+            return -1;
+        }
     }
 
-    /* Set the remaining output labels to 0 */
-    while (idim < ndim_left) {
-        out_labels[idim++] = 0;
+    /* If no ellipsis was found there should be no broadcast dimensions. */
+    if (!ellipsis && ndim_broadcast > 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "output has more dimensions than subscripts "
+                        "given in einstein sum, but no '...' ellipsis "
+                        "provided to broadcast the extra dimensions.");
+        return -1;
     }
 
     return ndim;
@@ -2121,7 +2035,7 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
             if (ibroadcast == ndim_output) {
                 PyErr_SetString(PyExc_ValueError,
                         "output had too few broadcast dimensions");
-                return 0;
+                return -1;
             }
             new_dims[ibroadcast] = PyArray_DIM(op, idim);
             new_strides[ibroadcast] = PyArray_STRIDE(op, idim);
@@ -2144,7 +2058,7 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
                         "index '%c' don't match (%d != %d)",
                         iop, label, (int)new_dims[i],
                         (int)PyArray_DIM(op, idim));
-                return 0;
+                return -1;
             }
             new_dims[i] = PyArray_DIM(op, idim);
             new_strides[i] += PyArray_STRIDE(op, idim);
@@ -2162,14 +2076,14 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
                                 (PyObject *)op);
 
         if (*ret == NULL) {
-            return 0;
+            return -1;
         }
         if (!PyArray_Check(*ret)) {
             Py_DECREF(*ret);
             *ret = NULL;
             PyErr_SetString(PyExc_RuntimeError,
                         "NewFromDescr failed to return an array");
-            return 0;
+            return -1;
         }
         PyArray_UpdateFlags(*ret,
                     NPY_ARRAY_C_CONTIGUOUS|
@@ -2179,14 +2093,14 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
         if (PyArray_SetBaseObject(*ret, (PyObject *)op) < 0) {
             Py_DECREF(*ret);
             *ret = NULL;
-            return 0;
+            return -1;
         }
-        return 1;
+        return 0;
     }
 
     /* Return success, but that we couldn't make a view */
     *ret = NULL;
-    return 1;
+    return 0;
 }
 
 static PyArrayObject *
@@ -2332,7 +2246,7 @@ prepare_op_axes(int ndim, int iop, char *labels, int *axes,
         }
     }
 
-    return 1;
+    return 0;
 }
 
 static int
@@ -2613,7 +2527,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
                     NPY_ORDER order, NPY_CASTING casting,
                     PyArrayObject *out)
 {
-    int iop, label, min_label = 127, max_label = 0, num_labels;
+    int iop, label, min_label = 127, max_label = 0;
     char label_counts[128];
     char op_labels[NPY_MAXARGS][NPY_MAXDIMS];
     char output_labels[NPY_MAXDIMS], *iter_labels;
@@ -2644,7 +2558,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
 
     /* Parse the subscripts string into label_counts and op_labels */
     memset(label_counts, 0, sizeof(label_counts));
-    num_labels = 0;
     for (iop = 0; iop < nop; ++iop) {
         int length = (int)strcspn(subscripts, ",-");
 
@@ -2661,10 +2574,10 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
             return NULL;
         }
 
-        if (!parse_operand_subscripts(subscripts, length,
+        if (parse_operand_subscripts(subscripts, length,
                         PyArray_NDIM(op_in[iop]),
                         iop, op_labels[iop], label_counts,
-                        &min_label, &max_label, &num_labels)) {
+                        &min_label, &max_label) < 0) {
             return NULL;
         }
 
@@ -2698,21 +2611,18 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     }
 
     /*
-     * If there is no output signature, create one using each label
-     * that appeared once, in alphabetical order
+     * If there is no output signature, fill output_labels and ndim_output
+     * using each label that appeared once, in alphabetical order.
      */
     if (subscripts[0] == '\0') {
-        char outsubscripts[NPY_MAXDIMS + 3];
-        int length;
-        /* If no output was specified, always broadcast left (like normal) */
-        outsubscripts[0] = '.';
-        outsubscripts[1] = '.';
-        outsubscripts[2] = '.';
-        length = 3;
+        /* If no output was specified, always broadcast left, as usual. */
+        for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) {
+            output_labels[ndim_output] = 0;
+        }
         for (label = min_label; label <= max_label; ++label) {
             if (label_counts[label] == 1) {
-                if (length < NPY_MAXDIMS-1) {
-                    outsubscripts[length++] = label;
+                if (ndim_output < NPY_MAXDIMS) {
+                    output_labels[ndim_output++] = label;
                 }
                 else {
                     PyErr_SetString(PyExc_ValueError,
@@ -2722,10 +2632,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
                 }
             }
         }
-        /* Parse the output subscript string */
-        ndim_output = parse_output_subscripts(outsubscripts, length,
-                                        ndim_broadcast, label_counts,
-                                        output_labels);
     }
     else {
         if (subscripts[0] != '-' || subscripts[1] != '>') {
@@ -2736,13 +2642,13 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         }
         subscripts += 2;
 
-        /* Parse the output subscript string */
+        /* Parse the output subscript string. */
         ndim_output = parse_output_subscripts(subscripts, strlen(subscripts),
                                         ndim_broadcast, label_counts,
                                         output_labels);
-    }
-    if (ndim_output < 0) {
-        return NULL;
+        if (ndim_output < 0) {
+            return NULL;
+        }
     }
 
     if (out != NULL && PyArray_NDIM(out) != ndim_output) {
@@ -2776,9 +2682,9 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         if (iop == 0 && nop == 1 && out == NULL) {
             ret = NULL;
 
-            if (!get_single_op_view(op_in[iop], iop, labels,
-                                    ndim_output, output_labels,
-                                    &ret)) {
+            if (get_single_op_view(op_in[iop], iop, labels,
+                                   ndim_output, output_labels,
+                                   &ret) < 0) {
                 return NULL;
             }
 
@@ -2840,8 +2746,8 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     for (iop = 0; iop < nop; ++iop) {
         op_axes[iop] = op_axes_arrays[iop];
 
-        if (!prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
-                    op_axes[iop], ndim_iter, iter_labels)) {
+        if (prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
+                    op_axes[iop], ndim_iter, iter_labels) < 0) {
             goto fail;
         }
     }
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 120ce0332..76af40439 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -317,13 +317,16 @@ addUfuncs(PyObject *dictionary) {
 static PyObject *
 UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
-    int nin, nout;
+    int nin, nout, i;
     PyObject *signature, *sig_str;
-    PyObject *f;
+    PyUFuncObject *f = NULL;
+    PyObject *core_num_dims = NULL, *core_dim_ixs = NULL;
     int core_enabled;
+    int core_num_ixs = 0;
 
-    if (!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) return NULL;
-
+    if (!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) {
+        return NULL;
+    }
 
     if (PyString_Check(signature)) {
         sig_str = signature;
@@ -334,17 +337,60 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
         return NULL;
     }
 
-    f = PyUFunc_FromFuncAndDataAndSignature(NULL, NULL, NULL,
+    f = (PyUFuncObject*)PyUFunc_FromFuncAndDataAndSignature(
+        NULL, NULL, NULL,
         0, nin, nout, PyUFunc_None, "no name",
         "doc:none",
         1, PyString_AS_STRING(sig_str));
     if (sig_str != signature) {
         Py_DECREF(sig_str);
     }
-    if (f == NULL) return NULL;
-    core_enabled = ((PyUFuncObject*)f)->core_enabled;
+    if (f == NULL) {
+        return NULL;
+    }
+    core_enabled = f->core_enabled;
+    /*
+     * Don't presume core_num_dims and core_dim_ixs are defined;
+     * they currently are even if core_enabled=0, but there's no real
+     * reason they should be.  So avoid segfaults if we change our mind.
+     */
+    if (f->core_num_dims != NULL) {
+        core_num_dims = PyTuple_New(f->nargs);
+        if (core_num_dims == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < f->nargs; i++) {
+            PyObject *val = PyLong_FromLong(f->core_num_dims[i]);
+            PyTuple_SET_ITEM(core_num_dims, i, val);
+            core_num_ixs += f->core_num_dims[i];
+        }
+    }
+    else {
+        Py_INCREF(Py_None);
+        core_num_dims = Py_None;
+    }
+    if (f->core_dim_ixs != NULL) {
+        core_dim_ixs = PyTuple_New(core_num_ixs);
+        if (core_num_dims == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < core_num_ixs; i++) {
+            PyObject * val = PyLong_FromLong(f->core_dim_ixs[i]);
+            PyTuple_SET_ITEM(core_dim_ixs, i, val);
+        }
+    }
+    else {
+        Py_INCREF(Py_None);
+        core_dim_ixs = Py_None;
+    }
     Py_DECREF(f);
-    return Py_BuildValue("i", core_enabled);
+    return Py_BuildValue("iOO", core_enabled, core_num_dims, core_dim_ixs);
+
+fail:
+    Py_XDECREF(f);
+    Py_XDECREF(core_num_dims);
+    Py_XDECREF(core_dim_ixs);
+    return NULL;
 }
 
 static PyMethodDef UMath_TestsMethods[] = {
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index a5b9ce76f..b7fda3f2e 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -285,10 +285,16 @@ class TestUfunc(object):
     def test_signature(self):
         # the arguments to test_signature are: nin, nout, core_signature
         # pass
-        assert_equal(umt.test_signature(2, 1, "(i),(i)->()"), 1)
+        enabled, num_dims, ixs = umt.test_signature(2, 1, "(i),(i)->()")
+        assert_equal(enabled, 1)
+        assert_equal(num_dims, (1,  1,  0))
+        assert_equal(ixs, (0, 0))
 
-        # pass. empty core signature; treat as plain ufunc (with trivial core)
-        assert_equal(umt.test_signature(2, 1, "(),()->()"), 0)
+        # empty core signature; treat as plain ufunc (with trivial core)
+        enabled, num_dims, ixs = umt.test_signature(2, 1, "(),()->()")
+        assert_equal(enabled, 0)
+        assert_equal(num_dims, (0,  0,  0))
+        assert_equal(ixs, ())
 
         # in the following calls, a ValueError should be raised because
         # of error in core signature
@@ -327,7 +333,10 @@ class TestUfunc(object):
             pass
 
         # more complicated names for variables
-        assert_equal(umt.test_signature(2, 1, "(i1,i2),(J_1)->(_kAB)"), 1)
+        enabled, num_dims, ixs = umt.test_signature(2, 1, "(i1,i2),(J_1)->(_kAB)")
+        assert_equal(enabled, 1)
+        assert_equal(num_dims, (2, 1, 1))
+        assert_equal(ixs, (0, 1, 2, 3))
 
     def test_get_signature(self):
         assert_equal(umt.inner1d.signature, "(i),(i)->()")
diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py
index cb7414a04..41f0b1f61 100644
--- a/numpy/distutils/misc_util.py
+++ b/numpy/distutils/misc_util.py
@@ -256,6 +256,11 @@ def minrelpath(path):
         return ''
     return os.sep.join(l)
 
+def sorted_glob(fileglob):
+    """sorts output of python glob for http://bugs.python.org/issue30461
+    to allow extensions to have reproducible build results"""
+    return sorted(glob.glob(fileglob))
+
 def _fix_paths(paths, local_path, include_non_existing):
     assert is_sequence(paths), repr(type(paths))
     new_paths = []
@@ -263,8 +268,8 @@ def _fix_paths(paths, local_path, include_non_existing):
     for n in paths:
         if is_string(n):
             if '*' in n or '?' in n:
-                p = glob.glob(n)
-                p2 = glob.glob(njoin(local_path, n))
+                p = sorted_glob(n)
+                p2 = sorted_glob(njoin(local_path, n))
                 if p2:
                     new_paths.extend(p2)
                 elif p:
@@ -528,7 +533,7 @@ def _get_headers(directory_list):
     # get *.h files from list of directories
     headers = []
     for d in directory_list:
-        head = glob.glob(os.path.join(d, "*.h")) #XXX: *.hpp files??
+        head = sorted_glob(os.path.join(d, "*.h")) #XXX: *.hpp files??
         headers.extend(head)
     return headers
 
@@ -882,7 +887,7 @@ class Configuration(object):
                                  caller_level = 1):
         l = subpackage_name.split('.')
         subpackage_path = njoin([self.local_path]+l)
-        dirs = [_m for _m in glob.glob(subpackage_path) if os.path.isdir(_m)]
+        dirs = [_m for _m in sorted_glob(subpackage_path) if os.path.isdir(_m)]
         config_list = []
         for d in dirs:
             if not os.path.isfile(njoin(d, '__init__.py')):
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index 43fdc5627..d2139338e 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -201,7 +201,7 @@ class nd_grid(object):
                 slobj = [_nx.newaxis]*len(size)
                 for k in range(len(size)):
                     slobj[k] = slice(None, None)
-                    nn[k] = nn[k][slobj]
+                    nn[k] = nn[k][tuple(slobj)]
                     slobj[k] = _nx.newaxis
             return nn
         except (IndexError, TypeError):
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index 41ef28ef3..65104115a 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -16,10 +16,235 @@ from numpy.matrixlib.defmatrix import matrix  # this raises all the right alarm
 __all__ = [
     'column_stack', 'row_stack', 'dstack', 'array_split', 'split',
     'hsplit', 'vsplit', 'dsplit', 'apply_over_axes', 'expand_dims',
-    'apply_along_axis', 'kron', 'tile', 'get_array_wrap'
+    'apply_along_axis', 'kron', 'tile', 'get_array_wrap', 'take_along_axis',
+    'put_along_axis'
     ]
 
 
+def _make_along_axis_idx(arr_shape, indices, axis):
+	# compute dimensions to iterate over
+    if not _nx.issubdtype(indices.dtype, _nx.integer):
+        raise IndexError('`indices` must be an integer array')
+    if len(arr_shape) != indices.ndim:
+        raise ValueError(
+            "`indices` and `arr` must have the same number of dimensions")
+    shape_ones = (1,) * indices.ndim
+    dest_dims = list(range(axis)) + [None] + list(range(axis+1, indices.ndim))
+
+    # build a fancy index, consisting of orthogonal aranges, with the
+    # requested index inserted at the right location
+    fancy_index = []
+    for dim, n in zip(dest_dims, arr_shape):
+        if dim is None:
+            fancy_index.append(indices)
+        else:
+            ind_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(_nx.arange(n).reshape(ind_shape))
+
+    return tuple(fancy_index)
+
+
+def take_along_axis(arr, indices, axis):
+    """
+    Take values from the input array by matching 1d index and data slices.
+
+    This iterates over matching 1d slices oriented along the specified axis in
+    the index and data arrays, and uses the former to look up values in the
+    latter. These slices can be different lengths.
+
+    Functions returning an index along an axis, like `argsort` and
+    `argpartition`, produce suitable indices for this function.
+
+    .. versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    arr: ndarray (Ni..., M, Nk...)
+        Source array
+    indices: ndarray (Ni..., J, Nk...)
+        Indices to take along each 1d slice of `arr`. This must match the
+        dimension of arr, but dimensions Ni and Nj only need to broadcast
+        against `arr`.
+    axis: int
+        The axis to take 1d slices along. If axis is None, the input array is
+        treated as if it had first been flattened to 1d, for consistency with
+        `sort` and `argsort`.
+
+    Returns
+    -------
+    out: ndarray (Ni..., J, Nk...)
+        The indexed result.
+
+    Notes
+    -----
+    This is equivalent to (but faster than) the following use of `ndindex` and
+    `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices::
+
+        Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:]
+        J = indices.shape[axis]  # Need not equal M
+        out = np.empty(Nk + (J,) + Nk)
+
+        for ii in ndindex(Ni):
+            for kk in ndindex(Nk):
+                a_1d       = a      [ii + s_[:,] + kk]
+                indices_1d = indices[ii + s_[:,] + kk]
+                out_1d     = out    [ii + s_[:,] + kk]
+                for j in range(J):
+                    out_1d[j] = a_1d[indices_1d[j]]
+
+    Equivalently, eliminating the inner loop, the last two lines would be::
+
+                out_1d[:] = a_1d[indices_1d]
+
+    See Also
+    --------
+    take : Take along an axis, using the same indices for every 1d slice
+    put_along_axis :
+        Put values into the destination array by matching 1d index and data slices
+
+    Examples
+    --------
+
+    For this sample array
+
+    >>> a = np.array([[10, 30, 20], [60, 40, 50]])
+
+    We can sort either by using sort directly, or argsort and this function
+
+    >>> np.sort(a, axis=1)
+    array([[10, 20, 30],
+           [40, 50, 60]])
+    >>> ai = np.argsort(a, axis=1); ai
+    array([[0, 2, 1],
+           [1, 2, 0]], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[10, 20, 30],
+           [40, 50, 60]])
+
+    The same works for max and min, if you expand the dimensions:
+
+    >>> np.expand_dims(np.max(a, axis=1), axis=1)
+    array([[30],
+           [60]])
+    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai
+    array([[1],
+           [0], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[30],
+           [60]])
+
+    If we want to get the max and min at the same time, we can stack the
+    indices first
+
+    >>> ai_min = np.expand_dims(np.argmin(a, axis=1), axis=1)
+    >>> ai_max = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai = np.concatenate([ai_min, ai_max], axis=axis)
+    >> ai
+    array([[0, 1],
+           [1, 0]], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[10, 30],
+           [40, 60]])
+    """
+    # normalize inputs
+    if axis is None:
+        arr = arr.flat
+        arr_shape = (len(arr),)  # flatiter has no .shape
+        axis = 0
+    else:
+        axis = normalize_axis_index(axis, arr.ndim)
+        arr_shape = arr.shape
+
+    # use the fancy index
+    return arr[_make_along_axis_idx(arr_shape, indices, axis)]
+
+
+def put_along_axis(arr, indices, values, axis):
+    """
+    Put values into the destination array by matching 1d index and data slices.
+
+    This iterates over matching 1d slices oriented along the specified axis in
+    the index and data arrays, and uses the former to place values into the
+    latter. These slices can be different lengths.
+
+    Functions returning an index along an axis, like `argsort` and
+    `argpartition`, produce suitable indices for this function.
+
+    .. versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    arr: ndarray (Ni..., M, Nk...)
+        Destination array.
+    indices: ndarray (Ni..., J, Nk...)
+        Indices to change along each 1d slice of `arr`. This must match the
+        dimension of arr, but dimensions in Ni and Nj may be 1 to broadcast
+        against `arr`.
+    values: array_like (Ni..., J, Nk...)
+        values to insert at those indices. Its shape and dimension are
+        broadcast to match that of `indices`.
+    axis: int
+        The axis to take 1d slices along. If axis is None, the destination
+        array is treated as if a flattened 1d view had been created of it.
+
+    Notes
+    -----
+    This is equivalent to (but faster than) the following use of `ndindex` and
+    `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices::
+
+        Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:]
+        J = indices.shape[axis]  # Need not equal M
+
+        for ii in ndindex(Ni):
+            for kk in ndindex(Nk):
+                a_1d       = a      [ii + s_[:,] + kk]
+                indices_1d = indices[ii + s_[:,] + kk]
+                values_1d  = values [ii + s_[:,] + kk]
+                for j in range(J):
+                    a_1d[indices_1d[j]] = values_1d[j]
+
+    Equivalently, eliminating the inner loop, the last two lines would be::
+
+                a_1d[indices_1d] = values_1d
+
+    See Also
+    --------
+    take_along_axis :
+        Take values from the input array by matching 1d index and data slices
+
+    Examples
+    --------
+
+    For this sample array
+
+    >>> a = np.array([[10, 30, 20], [60, 40, 50]])
+
+    We can replace the maximum values with:
+
+    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai
+    array([[1],
+           [0]], dtype=int64)
+    >>> np.put_along_axis(a, ai, 99, axis=1)
+    >>> a
+    array([[10, 99, 20],
+           [99, 40, 50]])
+
+    """
+    # normalize inputs
+    if axis is None:
+        arr = arr.flat
+        axis = 0
+        arr_shape = (len(arr),)  # flatiter has no .shape
+    else:
+        axis = normalize_axis_index(axis, arr.ndim)
+        arr_shape = arr.shape
+
+    # use the fancy index
+    arr[_make_along_axis_idx(arr_shape, indices, axis)] = values
+
+
 def apply_along_axis(func1d, axis, arr, *args, **kwargs):
     """
     Apply a function to 1-D slices along the given axis.
diff --git a/numpy/lib/tests/test_index_tricks.py b/numpy/lib/tests/test_index_tricks.py
index 089a7589a..315251daa 100644
--- a/numpy/lib/tests/test_index_tricks.py
+++ b/numpy/lib/tests/test_index_tricks.py
@@ -6,7 +6,7 @@ from numpy.testing import (
     assert_array_almost_equal, assert_raises, assert_raises_regex
     )
 from numpy.lib.index_tricks import (
-    mgrid, ndenumerate, fill_diagonal, diag_indices, diag_indices_from,
+    mgrid, ogrid, ndenumerate, fill_diagonal, diag_indices, diag_indices_from,
     index_exp, ndindex, r_, s_, ix_
     )
 
@@ -156,6 +156,15 @@ class TestGrid(object):
         assert_array_almost_equal(d[1, :, 1] - d[1, :, 0],
                                   0.2*np.ones(20, 'd'), 11)
 
+    def test_sparse(self):
+        grid_full   = mgrid[-1:1:10j, -2:2:10j]
+        grid_sparse = ogrid[-1:1:10j, -2:2:10j]
+
+        # sparse grids can be made dense by broadcasting
+        grid_broadcast = np.broadcast_arrays(*grid_sparse)
+        for f, b in zip(grid_full, grid_broadcast):
+            assert_equal(f, b)
+
 
 class TestConcatenator(object):
     def test_1d(self):
diff --git a/numpy/lib/tests/test_shape_base.py b/numpy/lib/tests/test_shape_base.py
index a35d90b70..c95894f94 100644
--- a/numpy/lib/tests/test_shape_base.py
+++ b/numpy/lib/tests/test_shape_base.py
@@ -2,16 +2,106 @@ from __future__ import division, absolute_import, print_function
 
 import numpy as np
 import warnings
+import functools
 
 from numpy.lib.shape_base import (
     apply_along_axis, apply_over_axes, array_split, split, hsplit, dsplit,
-    vsplit, dstack, column_stack, kron, tile, expand_dims,
+    vsplit, dstack, column_stack, kron, tile, expand_dims, take_along_axis,
+    put_along_axis
     )
 from numpy.testing import (
     assert_, assert_equal, assert_array_equal, assert_raises, assert_warns
     )
 
 
+def _add_keepdims(func):
+    """ hack in keepdims behavior into a function taking an axis """
+    @functools.wraps(func)
+    def wrapped(a, axis, **kwargs):
+        res = func(a, axis=axis, **kwargs)
+        if axis is None:
+            axis = 0  # res is now a scalar, so we can insert this anywhere
+        return np.expand_dims(res, axis=axis)
+    return wrapped
+
+
+class TestTakeAlongAxis(object):
+    def test_argequivalent(self):
+        """ Test it translates from arg<func> to <func> """
+        from numpy.random import rand
+        a = rand(3, 4, 5)
+
+        funcs = [
+            (np.sort, np.argsort, dict()),
+            (_add_keepdims(np.min), _add_keepdims(np.argmin), dict()),
+            (_add_keepdims(np.max), _add_keepdims(np.argmax), dict()),
+            (np.partition, np.argpartition, dict(kth=2)),
+        ]
+
+        for func, argfunc, kwargs in funcs:
+            for axis in list(range(a.ndim)) + [None]:
+                a_func = func(a, axis=axis, **kwargs)
+                ai_func = argfunc(a, axis=axis, **kwargs)
+                assert_equal(a_func, take_along_axis(a, ai_func, axis=axis))
+
+    def test_invalid(self):
+        """ Test it errors when indices has too few dimensions """
+        a = np.ones((10, 10))
+        ai = np.ones((10, 2), dtype=np.intp)
+
+        # sanity check
+        take_along_axis(a, ai, axis=1)
+
+        # not enough indices
+        assert_raises(ValueError, take_along_axis, a, np.array(1), axis=1)
+        # bool arrays not allowed
+        assert_raises(IndexError, take_along_axis, a, ai.astype(bool), axis=1)
+        # float arrays not allowed
+        assert_raises(IndexError, take_along_axis, a, ai.astype(float), axis=1)
+        # invalid axis
+        assert_raises(np.AxisError, take_along_axis, a, ai, axis=10)
+
+    def test_empty(self):
+        """ Test everything is ok with empty results, even with inserted dims """
+        a  = np.ones((3, 4, 5))
+        ai = np.ones((3, 0, 5), dtype=np.intp)
+
+        actual = take_along_axis(a, ai, axis=1)
+        assert_equal(actual.shape, ai.shape)
+
+    def test_broadcast(self):
+        """ Test that non-indexing dimensions are broadcast in both directions """
+        a  = np.ones((3, 4, 1))
+        ai = np.ones((1, 2, 5), dtype=np.intp)
+        actual = take_along_axis(a, ai, axis=1)
+        assert_equal(actual.shape, (3, 2, 5))
+
+
+class TestPutAlongAxis(object):
+    def test_replace_max(self):
+        a_base = np.array([[10, 30, 20], [60, 40, 50]])
+
+        for axis in list(range(a_base.ndim)) + [None]:
+            # we mutate this in the loop
+            a = a_base.copy()
+
+            # replace the max with a small value
+            i_max = _add_keepdims(np.argmax)(a, axis=axis)
+            put_along_axis(a, i_max, -99, axis=axis)
+
+            # find the new minimum, which should max
+            i_min = _add_keepdims(np.argmin)(a, axis=axis)
+
+            assert_equal(i_min, i_max)
+
+    def test_broadcast(self):
+        """ Test that non-indexing dimensions are broadcast in both directions """
+        a  = np.ones((3, 4, 1))
+        ai = np.arange(10, dtype=np.intp).reshape((1, 2, 5)) % 4
+        put_along_axis(a, ai, 20, axis=1)
+        assert_equal(take_along_axis(a, ai, axis=1), 20)
+
+
 class TestApplyAlongAxis(object):
     def test_simple(self):
         a = np.ones((20, 10), 'd')
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 5757b1827..98af0733b 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -210,7 +210,8 @@ def _assertSquareness(*arrays):
 
 def _assertNdSquareness(*arrays):
     for a in arrays:
-        if max(a.shape[-2:]) != min(a.shape[-2:]):
+        m, n = a.shape[-2:]
+        if m != n:
             raise LinAlgError('Last 2 dimensions of the array must be square')
 
 def _assertFinite(*arrays):
@@ -1532,8 +1533,7 @@ def svd(a, full_matrices=True, compute_uv=True):
 
     extobj = get_linalg_error_extobj(_raise_linalgerror_svd_nonconvergence)
 
-    m = a.shape[-2]
-    n = a.shape[-1]
+    m, n = a.shape[-2:]
     if compute_uv:
         if full_matrices:
             if m < n:
@@ -1853,7 +1853,8 @@ def pinv(a, rcond=1e-15 ):
     a, wrap = _makearray(a)
     rcond = asarray(rcond)
     if _isEmpty2d(a):
-        res = empty(a.shape[:-2] + (a.shape[-1], a.shape[-2]), dtype=a.dtype)
+        m, n = a.shape[-2:]
+        res = empty(a.shape[:-2] + (n, m), dtype=a.dtype)
         return wrap(res)
     a = a.conjugate()
     u, s, vt = svd(a, full_matrices=False)
@@ -2110,10 +2111,9 @@ def lstsq(a, b, rcond="warn"):
         b = b[:, newaxis]
     _assertRank2(a, b)
     _assertNoEmpty2d(a, b)  # TODO: relax this constraint
-    m  = a.shape[0]
-    n  = a.shape[1]
-    n_rhs = b.shape[1]
-    if m != b.shape[0]:
+    m, n = a.shape[-2:]
+    m2, n_rhs = b.shape[-2:]
+    if m != m2:
         raise LinAlgError('Incompatible dimensions')
 
     t, result_t = _commonType(a, b)
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 3a5ad7250..7dc1cb0cb 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3202,7 +3202,7 @@ static void
                     for (i = 0; i < nrhs; i++) {
                         @ftyp@ *vector = components + i*m;
                         /* Numpy and fortran floating types are the same size,
-                         * so this case is safe */
+                         * so this cast is safe */
                         @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
                         memcpy(
                             resid + i*r_out.column_strides,
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 17682d13f..5ed086db3 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -2992,7 +2992,9 @@ class MaskedArray(ndarray):
                     order = "K"
 
                 _mask = _mask.astype(_mask_dtype, order)
-
+            else:
+                # Take a view so shape changes, etc., do not propagate back.
+                _mask = _mask.view()
         else:
             _mask = nomask
 
@@ -3337,17 +3339,35 @@ class MaskedArray(ndarray):
             _mask[indx] = mindx
         return
 
-    def __setattr__(self, attr, value):
-        super(MaskedArray, self).__setattr__(attr, value)
-        if attr == 'dtype' and self._mask is not nomask:
-            self._mask = self._mask.view(make_mask_descr(value), ndarray)
-            # Try to reset the shape of the mask (if we don't have a void)
-            # This raises a ValueError if the dtype change won't work
+    # Define so that we can overwrite the setter.
+    @property
+    def dtype(self):
+        return super(MaskedArray, self).dtype
+
+    @dtype.setter
+    def dtype(self, dtype):
+        super(MaskedArray, type(self)).dtype.__set__(self, dtype)
+        if self._mask is not nomask:
+            self._mask = self._mask.view(make_mask_descr(dtype), ndarray)
+            # Try to reset the shape of the mask (if we don't have a void).
+            # This raises a ValueError if the dtype change won't work.
             try:
                 self._mask.shape = self.shape
             except (AttributeError, TypeError):
                 pass
 
+    @property
+    def shape(self):
+        return super(MaskedArray, self).shape
+
+    @shape.setter
+    def shape(self, shape):
+        super(MaskedArray, type(self)).shape.__set__(self, shape)
+        # Cannot use self._mask, since it may not (yet) exist when a
+        # masked matrix sets the shape.
+        if getmask(self) is not nomask:
+            self._mask.shape = self.shape
+
     def __setmask__(self, mask, copy=False):
         """
         Set the mask.
@@ -5524,15 +5544,7 @@ class MaskedArray(ndarray):
         sidx = self.argsort(axis=axis, kind=kind, order=order,
                             fill_value=fill_value, endwith=endwith)
 
-        # save memory for 1d arrays
-        if self.ndim == 1:
-            idx = sidx
-        else:
-            idx = list(np.ix_(*[np.arange(x) for x in self.shape]))
-            idx[axis] = sidx
-            idx = tuple(idx)
-
-        self[...] = self[idx]
+        self[...] = np.take_along_axis(self, sidx, axis=axis)
 
     def min(self, axis=None, out=None, fill_value=None, keepdims=np._NoValue):
         """
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index da35217d1..3be4d3625 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -747,19 +747,17 @@ def _median(a, axis=None, out=None, overwrite_input=False):
             return np.ma.minimum_fill_value(asorted)
         return s
 
-    counts = count(asorted, axis=axis)
+    counts = count(asorted, axis=axis, keepdims=True)
     h = counts // 2
 
-    # create indexing mesh grid for all but reduced axis
-    axes_grid = [np.arange(x) for i, x in enumerate(asorted.shape)
-                 if i != axis]
-    ind = np.meshgrid(*axes_grid, sparse=True, indexing='ij')
+    # duplicate high if odd number of elements so mean does nothing
+    odd = counts % 2 == 1
+    l = np.where(odd, h, h-1)
 
-    # insert indices of low and high median
-    ind.insert(axis, h - 1)
-    low = asorted[tuple(ind)]
-    ind[axis] = np.minimum(h, asorted.shape[axis] - 1)
-    high = asorted[tuple(ind)]
+    lh = np.concatenate([l,h], axis=axis)
+
+    # get low and high median
+    low_high = np.take_along_axis(asorted, lh, axis=axis)
 
     def replace_masked(s):
         # Replace masked entries with minimum_full_value unless it all values
@@ -767,30 +765,20 @@ def _median(a, axis=None, out=None, overwrite_input=False):
         # larger than the fill value is undefined and a valid value placed
         # elsewhere, e.g. [4, --, inf].
         if np.ma.is_masked(s):
-            rep = (~np.all(asorted.mask, axis=axis)) & s.mask
+            rep = (~np.all(asorted.mask, axis=axis, keepdims=True)) & s.mask
             s.data[rep] = np.ma.minimum_fill_value(asorted)
             s.mask[rep] = False
 
-    replace_masked(low)
-    replace_masked(high)
-
-    # duplicate high if odd number of elements so mean does nothing
-    odd = counts % 2 == 1
-    np.copyto(low, high, where=odd)
-    # not necessary for scalar True/False masks
-    try:
-        np.copyto(low.mask, high.mask, where=odd)
-    except Exception:
-        pass
+    replace_masked(low_high)
 
     if np.issubdtype(asorted.dtype, np.inexact):
         # avoid inf / x = masked
-        s = np.ma.sum([low, high], axis=0, out=out)
+        s = np.ma.sum(low_high, axis=axis, out=out)
         np.true_divide(s.data, 2., casting='unsafe', out=s.data)
 
         s = np.lib.utils._median_nancheck(asorted, s, axis, out)
     else:
-        s = np.ma.mean([low, high], axis=0, out=out)
+        s = np.ma.mean(low_high, axis=axis, out=out)
 
     return s
 
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index 4c7440aab..51616f214 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -352,9 +352,11 @@ class TestMaskedArray(object):
         assert_equal(y1._mask.__array_interface__, m.__array_interface__)
 
         y1a = array(y1)
+        # Default for masked array is not to copy; see gh-10318.
         assert_(y1a._data.__array_interface__ ==
                         y1._data.__array_interface__)
-        assert_(y1a.mask is y1.mask)
+        assert_(y1a._mask.__array_interface__ ==
+                        y1._mask.__array_interface__)
 
         y2 = array(x1, mask=m3)
         assert_(y2._data.__array_interface__ == x1.__array_interface__)
diff --git a/numpy/ma/tests/test_old_ma.py b/numpy/ma/tests/test_old_ma.py
index 70eab0edc..d7b1e3c18 100644
--- a/numpy/ma/tests/test_old_ma.py
+++ b/numpy/ma/tests/test_old_ma.py
@@ -273,7 +273,11 @@ class TestMa(object):
         assert_(y1.mask is m)
 
         y1a = array(y1, copy=0)
-        assert_(y1a.mask is y1.mask)
+        # For copy=False, one might expect that the array would just
+        # passed on, i.e., that it would be "is" instead of "==".
+        # See gh-4043 for discussion.
+        assert_(y1a._mask.__array_interface__ ==
+                y1._mask.__array_interface__)
 
         y2 = array(x1, mask=m3, copy=0)
         assert_(y2.mask is m3)
diff --git a/numpy/ma/tests/test_regression.py b/numpy/ma/tests/test_regression.py
index 04e10d9d1..96c418a51 100644
--- a/numpy/ma/tests/test_regression.py
+++ b/numpy/ma/tests/test_regression.py
@@ -74,3 +74,13 @@ class TestRegression(object):
             r1 = np.ma.corrcoef(x, y, ddof=1)
             # ddof should not have an effect (it gets cancelled out)
             assert_allclose(r0.data, r1.data)
+
+    def test_mask_not_backmangled(self):
+        # See gh-10314.  Test case taken from gh-3140.
+        a = np.ma.MaskedArray([1., 2.], mask=[False, False])
+        assert_(a.mask.shape == (2,))
+        b = np.tile(a, (2, 1))
+        # Check that the above no longer changes a.shape to (1, 2)
+        assert_(a.mask.shape == (2,))
+        assert_(b.shape == (2, 2))
+        assert_(b.mask.shape == (2, 2))