80 files changed, 2385 insertions, 1172 deletions
diff --git a/numpy/__init__.py b/numpy/__init__.py
index d10a1ecd3..d250ed5ac 100644
--- a/numpy/__init__.py
+++ b/numpy/__init__.py
@@ -194,3 +194,28 @@ else:
     from numpy.testing._private.pytesttester import PytestTester
     test = PytestTester(__name__)
     del PytestTester
+
+
+    def _sanity_check():
+        """
+        Quick sanity checks for common bugs caused by environment.
+        There are some cases e.g. with wrong BLAS ABI that cause wrong
+        results under specific runtime conditions that are not necessarily
+        achieved during test suite runs, and it is useful to catch those early.
+
+        See https://github.com/numpy/numpy/issues/8577 and other
+        similar bug reports.
+
+        """
+        try:
+            x = ones(2, dtype=float32)
+            if not abs(x.dot(x) - 2.0) < 1e-5:
+                raise AssertionError()
+        except AssertionError:
+            msg = ("The current Numpy installation ({!r}) fails to "
+                   "pass simple sanity checks. This can be caused for example "
+                   "by incorrect BLAS library being linked in.")
+            raise RuntimeError(msg.format(__file__))
+
+    _sanity_check()
+    del _sanity_check
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 947004001..9372b3431 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -4759,6 +4759,11 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('tofile',
     machines with different endianness. Some of these problems can be overcome
     by outputting the data as text files, at the expense of speed and file
     size.
+    
+    When fid is a file object, array contents are directly written to the
+    file, bypassing the file object's ``write`` method. As a result, tofile
+    cannot be used with files objects supporting compression (e.g., GzipFile)
+    or file-like objects that do not support ``fileno()`` (e.g., BytesIO).
 
     """))
 
@@ -5602,6 +5607,37 @@ add_newdoc('numpy.core.multiarray', 'unpackbits',
 
     """)
 
+add_newdoc('numpy.core._multiarray_tests', 'format_float_OSprintf_g',
+    """
+    format_float_OSprintf_g(val, precision)
+
+    Print a floating point scalar using the system's printf function,
+    equivalent to:
+
+        printf("%.*g", precision, val);
+
+    for half/float/double, or replacing 'g' by 'Lg' for longdouble. This
+    method is designed to help cross-validate the format_float_* methods.
+
+    Parameters
+    ----------
+    val : python float or numpy floating scalar
+        Value to format.
+
+    precision : non-negative integer, optional
+        Precision given to printf.
+
+    Returns
+    -------
+    rep : string
+        The string representation of the floating point value
+
+    See Also
+    --------
+    format_float_scientific
+    format_float_positional
+    """)
+
 
 ##############################################################################
 #
@@ -5647,10 +5683,13 @@ add_newdoc('numpy.core', 'ufunc',
         Alternate array object(s) in which to put the result; if provided, it
         must have a shape that the inputs broadcast to. A tuple of arrays
         (possible only as a keyword argument) must have length equal to the
-        number of outputs; use `None` for outputs to be allocated by the ufunc.
+        number of outputs; use `None` for uninitialized outputs to be
+        allocated by the ufunc.
     where : array_like, optional
         Values of True indicate to calculate the ufunc at that position, values
-        of False indicate to leave the value in the output alone.
+        of False indicate to leave the value in the output alone.  Note that if
+        an uninitialized return array is created via the default ``out=None``,
+        then the elements where the values are False will remain uninitialized.
     **kwargs
         For other keyword-only arguments, see the :ref:`ufunc docs <ufuncs.kwargs>`.
 
@@ -5658,7 +5697,8 @@ add_newdoc('numpy.core', 'ufunc',
     -------
     r : ndarray or tuple of ndarray
         `r` will have the shape that the arrays in `x` broadcast to; if `out` is
-        provided, `r` will be equal to `out`. If the function has more than one
+        provided, it will be returned. If not, `r` will be allocated and
+        may contain uninitialized values. If the function has more than one
         output, then the result will be a tuple of arrays.
 
     """)
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 112af9a34..632bcb41f 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -972,7 +972,7 @@ def make_arrays(funcdict):
                     for vt in t.simd:
                         code2list.append(textwrap.dedent("""\
                         #ifdef HAVE_ATTRIBUTE_TARGET_{ISA}
-                        if (npy_cpu_supports("{ISA}")) {{
+                        if (npy_cpu_supports("{isa}")) {{
                             {fname}_functions[{idx}] = {type}_{fname}_{isa};
                         }}
                         #endif
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index bd90d0460..f7d58a26f 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -3365,7 +3365,7 @@ add_newdoc('numpy.core.umath', 'sinh',
 
 add_newdoc('numpy.core.umath', 'sqrt',
     """
-    Return the positive square-root of an array, element-wise.
+    Return the non-negative square-root of an array, element-wise.
 
     Parameters
     ----------
diff --git a/numpy/core/einsumfunc.py b/numpy/core/einsumfunc.py
index bb6767c4f..a4c18d482 100644
--- a/numpy/core/einsumfunc.py
+++ b/numpy/core/einsumfunc.py
@@ -1109,7 +1109,7 @@ def einsum(*operands, **kwargs):
             # Checks have already been handled
             input_str, results_index = einsum_str.split('->')
             input_left, input_right = input_str.split(',')
-            if 1 in tmp_operands[0] or 1 in tmp_operands[1]:
+            if 1 in tmp_operands[0].shape or 1 in tmp_operands[1].shape:
                 left_dims = {dim: size for dim, size in
                              zip(input_left, tmp_operands[0].shape)}
                 right_dims = {dim: size for dim, size in
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 75bcedd81..d1aae0aa0 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -140,6 +140,7 @@ def take(a, indices, axis=None, out=None, mode='raise'):
     --------
     compress : Take elements using a boolean mask
     ndarray.take : equivalent method
+    take_along_axis : Take elements by matching the array and the index arrays
 
     Notes
     -----
@@ -478,6 +479,7 @@ def put(a, ind, v, mode='raise'):
     See Also
     --------
     putmask, place
+    put_along_axis : Put elements by matching the array and the index arrays
 
     Examples
     --------
@@ -723,7 +725,9 @@ def argpartition(a, kth, axis=-1, kind='introselect', order=None):
     -------
     index_array : ndarray, int
         Array of indices that partition `a` along the specified axis.
-        In other words, ``a[index_array]`` yields a partitioned `a`.
+        If `a` is one-dimensional, ``a[index_array]`` yields a partitioned `a`.
+        More generally, ``np.take_along_axis(a, index_array, axis=a)`` always
+        yields the partitioned `a`, irrespective of dimensionality.
 
     See Also
     --------
@@ -904,6 +908,8 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
     index_array : ndarray, int
         Array of indices that sort `a` along the specified axis.
         If `a` is one-dimensional, ``a[index_array]`` yields a sorted `a`.
+        More generally, ``np.take_along_axis(a, index_array, axis=a)`` always
+        yields the sorted `a`, irrespective of dimensionality.
 
     See Also
     --------
@@ -1336,10 +1342,11 @@ def diagonal(a, offset=0, axis1=0, axis2=1):
     Returns
     -------
     array_of_diagonals : ndarray
-        If `a` is 2-D and not a `matrix`, a 1-D array of the same type as `a`
-        containing the diagonal is returned. If `a` is a `matrix`, a 1-D
-        array containing the diagonal is returned in order to maintain
-        backward compatibility.
+        If `a` is 2-D, then a 1-D array containing the diagonal and of the
+        same type as `a` is returned unless `a` is a `matrix`, in which case
+        a 1-D array rather than a (2-D) `matrix` is returned in order to
+        maintain backward compatibility.
+        
         If ``a.ndim > 2``, then the dimensions specified by `axis1` and `axis2`
         are removed, and a new axis inserted at the end corresponding to the
         diagonal.
@@ -1496,10 +1503,9 @@ def ravel(a, order='C'):
     Returns
     -------
     y : array_like
-        If `a` is a matrix, y is a 1-D ndarray, otherwise y is an array of
-        the same subtype as `a`. The shape of the returned array is
-        ``(a.size,)``. Matrices are special cased for backward
-        compatibility.
+        y is an array of the same subtype as `a`, with shape ``(a.size,)``.
+        Note that matrices are special cased for backward compatibility, if `a`
+        is a matrix, then y is a 1-D ndarray.
 
     See Also
     --------
diff --git a/numpy/core/include/numpy/ufuncobject.h b/numpy/core/include/numpy/ufuncobject.h
index d0ac1fd7d..4b1b3d325 100644
--- a/numpy/core/include/numpy/ufuncobject.h
+++ b/numpy/core/include/numpy/ufuncobject.h
@@ -167,7 +167,7 @@ typedef struct _tagPyUFuncObject {
         int *core_dim_ixs;
         /*
          * positions of 1st core dimensions of each
-         * argument in core_dim_ixs
+         * argument in core_dim_ixs, equivalent to cumsum(core_num_dims)
          */
         int *core_offsets;
         /* signature string for printing purpose */
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index 1108d4667..7ade3d224 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -489,9 +489,9 @@ def asarray(a, dtype=None, order=None):
 
     Contrary to `asanyarray`, ndarray subclasses are not passed through:
 
-    >>> issubclass(np.matrix, np.ndarray)
+    >>> issubclass(np.recarray, np.ndarray)
     True
-    >>> a = np.matrix([[1, 2]])
+    >>> a = np.array([(1.0, 2), (3.0, 4)], dtype='f4,i4').view(np.recarray)
     >>> np.asarray(a) is a
     False
     >>> np.asanyarray(a) is a
@@ -545,7 +545,7 @@ def asanyarray(a, dtype=None, order=None):
 
     Instances of `ndarray` subclasses are passed through as-is:
 
-    >>> a = np.matrix([1, 2])
+    >>> a = np.array([(1.0, 2), (3.0, 4)], dtype='f4,i4').view(np.recarray)
     >>> np.asanyarray(a) is a
     True
 
@@ -2035,7 +2035,7 @@ def binary_repr(num, width=None):
     '11101'
 
     """
-    def warn_if_insufficient(width, binwdith):
+    def warn_if_insufficient(width, binwidth):
         if width is not None and width < binwidth:
             warnings.warn(
                 "Insufficient bit width provided. This behavior "
@@ -2280,7 +2280,7 @@ def isclose(a, b, rtol=1.e-5, atol=1.e-8, equal_nan=False):
     relative difference (`rtol` * abs(`b`)) and the absolute difference
     `atol` are added together to compare against the absolute difference
     between `a` and `b`.
-    
+
     .. warning:: The default `atol` is not appropriate for comparing numbers
                  that are much smaller than one (see Notes).
 
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 7d8bab557..f826b278f 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -452,17 +452,8 @@ def configuration(parent_package='',top_path=None):
                 moredefs.append(('NPY_RELAXED_STRIDES_DEBUG', 1))
 
             # Get long double representation
-            if sys.platform != 'darwin':
-                rep = check_long_double_representation(config_cmd)
-                if rep in ['INTEL_EXTENDED_12_BYTES_LE',
-                           'INTEL_EXTENDED_16_BYTES_LE',
-                           'MOTOROLA_EXTENDED_12_BYTES_BE',
-                           'IEEE_QUAD_LE', 'IEEE_QUAD_BE',
-                           'IEEE_DOUBLE_LE', 'IEEE_DOUBLE_BE',
-                           'DOUBLE_DOUBLE_BE', 'DOUBLE_DOUBLE_LE']:
-                    moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1))
-                else:
-                    raise ValueError("Unrecognized long double format: %s" % rep)
+            rep = check_long_double_representation(config_cmd)
+            moredefs.append(('HAVE_LDOUBLE_%s' % rep, 1))
 
             # Py3K check
             if sys.version_info[0] == 3:
diff --git a/numpy/core/setup_common.py b/numpy/core/setup_common.py
index a8aba40bd..450fb1b9d 100644
--- a/numpy/core/setup_common.py
+++ b/numpy/core/setup_common.py
@@ -364,11 +364,16 @@ def long_double_representation(lines):
             # the long double
             if read[-8:] == _AFTER_SEQ:
                 saw = copy.copy(read)
+                # if the content was 12 bytes, we only have 32 - 8 - 12 = 12
+                # "before" bytes. In other words the first 4 "before" bytes went
+                # past the sliding window.
                 if read[:12] == _BEFORE_SEQ[4:]:
                     if read[12:-8] == _INTEL_EXTENDED_12B:
                         return 'INTEL_EXTENDED_12_BYTES_LE'
                     if read[12:-8] == _MOTOROLA_EXTENDED_12B:
                         return 'MOTOROLA_EXTENDED_12_BYTES_BE'
+                # if the content was 16 bytes, we are left with 32-8-16 = 16
+                # "before" bytes, so 8 went past the sliding window.
                 elif read[:8] == _BEFORE_SEQ[8:]:
                     if read[8:-8] == _INTEL_EXTENDED_16B:
                         return 'INTEL_EXTENDED_16_BYTES_LE'
@@ -380,6 +385,7 @@ def long_double_representation(lines):
                         return 'DOUBLE_DOUBLE_BE'
                     elif read[8:-8] == _DOUBLE_DOUBLE_LE:
                         return 'DOUBLE_DOUBLE_LE'
+                # if the content was 8 bytes, left with 32-8-8 = 16 bytes
                 elif read[:16] == _BEFORE_SEQ:
                     if read[16:-8] == _IEEE_DOUBLE_LE:
                         return 'IEEE_DOUBLE_LE'
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index 0299f1a1b..cba96a4c2 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -3,7 +3,9 @@
 #include <Python.h>
 #define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */
 #include "numpy/arrayobject.h"
+#include "numpy/arrayscalars.h"
 #include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
 #include "mem_overlap.h"
 #include "npy_extint128.h"
 #include "common.h"
@@ -1828,6 +1830,63 @@ call_npy_@name@@suffix@(PyObject *NPY_UNUSED(self), PyObject *args)
 
 /**end repeat**/
 
+/*
+ * For development/testing purposes, it's convenient to have access to the
+ * system printf for floats. This is a very simple printf interface.
+ */
+PyObject *
+PrintFloat_Printf_g(PyObject *obj, int precision)
+{
+    char str[1024];
+
+    if (PyArray_IsScalar(obj, Half)) {
+        npy_half x = ((PyHalfScalarObject *)obj)->obval;
+        PyOS_snprintf(str, sizeof(str), "%.*g", precision,
+                      npy_half_to_double(x));
+    }
+    else if (PyArray_IsScalar(obj, Float)) {
+        npy_float x = ((PyFloatScalarObject *)obj)->obval;
+        PyOS_snprintf(str, sizeof(str), "%.*g", precision, x);
+    }
+    else if (PyArray_IsScalar(obj, Double)) {
+        npy_double x = ((PyDoubleScalarObject *)obj)->obval;
+        PyOS_snprintf(str, sizeof(str), "%.*g", precision, x);
+        /* would be better to use lg, but not available in C90 */
+    }
+    else if (PyArray_IsScalar(obj, LongDouble)) {
+        npy_longdouble x = ((PyLongDoubleScalarObject *)obj)->obval;
+        PyOS_snprintf(str, sizeof(str), "%.*Lg", precision, x);
+    }
+    else{
+        double val = PyFloat_AsDouble(obj);
+        if (error_converting(val)) {
+            return NULL;
+        }
+        PyOS_snprintf(str, sizeof(str), "%.*g", precision, val);
+    }
+
+    return PyUString_FromString(str);
+}
+
+
+static PyObject *
+printf_float_g(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+{
+    PyObject *obj;
+    int precision;
+
+    if (!PyArg_ParseTuple(args,"Oi:format_float_OSprintf_g", &obj,
+                                                             &precision)) {
+        return NULL;
+    }
+
+    if (precision < 0) {
+        PyErr_SetString(PyExc_TypeError, "precision must be non-negative");
+        return NULL;
+    }
+
+    return PrintFloat_Printf_g(obj, precision);
+}
 
 static PyMethodDef Multiarray_TestsMethods[] = {
     {"IsPythonScalar",
@@ -1967,7 +2026,9 @@ static PyMethodDef Multiarray_TestsMethods[] = {
 /**end repeat1**/
 
 /**end repeat**/
-
+    {"format_float_OSprintf_g",
+        (PyCFunction)printf_float_g,
+        METH_VARARGS , NULL},
     {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index 42f876125..972147bb0 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -182,6 +182,15 @@ npy_strtoull(const char *str, char **endptr, int base)
  *****************************************************************************
  */
 
+#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
+/*
+ * Disable harmless compiler warning "4116: unnamed type definition in
+ * parentheses" which is caused by the _ALIGN macro.
+ */
+#if defined(_MSC_VER)
+#pragma warning(disable:4116)
+#endif
+
 
 /**begin repeat
  *
@@ -246,8 +255,10 @@ static int
         }
         return -1;
     }
-    if (ap == NULL || PyArray_ISBEHAVED(ap))
+    if (ap == NULL || PyArray_ISBEHAVED(ap)) {
+        assert(npy_is_aligned(ov, _ALIGN(@type@)));
         *((@type@ *)ov)=temp;
+    }
     else {
         PyArray_DESCR(ap)->f->copyswap(ov, &temp, PyArray_ISBYTESWAPPED(ap),
                                        ap);
@@ -746,7 +757,7 @@ NPY_NO_EXPORT int PyArray_CopyObject(PyArrayObject *, PyObject *);
  */
 NPY_NO_EXPORT int
 _setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr,
-            npy_intp *offset_p)
+            npy_intp *offset_p, char *dstdata)
 {
     PyObject *key;
     PyObject *tup;
@@ -760,7 +771,8 @@ _setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr,
     }
 
     ((PyArrayObject_fields *)(arr))->descr = new;
-    if ((new->alignment > 1) && ((offset % new->alignment) != 0)) {
+    if ((new->alignment > 1) && 
+                ((((uintptr_t)dstdata + offset) % new->alignment) != 0)) {
         PyArray_CLEARFLAGS(arr, NPY_ARRAY_ALIGNED);
     }
     else {
@@ -788,7 +800,7 @@ _copy_and_return_void_setitem(PyArray_Descr *dstdescr, char *dstdata,
     if (PyArray_EquivTypes(srcdescr, dstdescr)) {
         for (i = 0; i < names_size; i++) {
             /* neither line can ever fail, in principle */
-            if (_setup_field(i, dstdescr, dummy, &offset)) {
+            if (_setup_field(i, dstdescr, dummy, &offset, dstdata)) {
                 return -1;
             }
             PyArray_DESCR(dummy)->f->copyswap(dstdata + offset,
@@ -858,7 +870,7 @@ VOID_setitem(PyObject *op, void *input, void *vap)
                 PyObject *item;
 
                 /* temporarily make ap have only this field */
-                if (_setup_field(i, descr, ap, &offset) == -1) {
+                if (_setup_field(i, descr, ap, &offset, ip) == -1) {
                     failed = 1;
                     break;
                 }
@@ -880,7 +892,7 @@ VOID_setitem(PyObject *op, void *input, void *vap)
 
             for (i = 0; i < names_size; i++) {
                 /* temporarily make ap have only this field */
-                if (_setup_field(i, descr, ap, &offset) == -1) {
+                if (_setup_field(i, descr, ap, &offset, ip) == -1) {
                     failed = 1;
                     break;
                 }
@@ -4193,17 +4205,6 @@ small_correlate(const char * d_, npy_intp dstride,
  *****************************************************************************
  */
 
-
-#define _ALIGN(type) offsetof(struct {char c; type v;}, v)
-/*
- * Disable harmless compiler warning "4116: unnamed type definition in
- * parentheses" which is caused by the _ALIGN macro.
- */
-#if defined(_MSC_VER)
-#pragma warning(disable:4116)
-#endif
-
-
 /**begin repeat
  *
  * #from = VOID, STRING, UNICODE#
diff --git a/numpy/core/src/multiarray/datetime.c b/numpy/core/src/multiarray/datetime.c
index a4a028ad4..af542aecc 100644
--- a/numpy/core/src/multiarray/datetime.c
+++ b/numpy/core/src/multiarray/datetime.c
@@ -2808,9 +2808,12 @@ convert_pyobject_to_timedelta(PyArray_DatetimeMetaData *meta, PyObject *obj,
                 us_meta.base = NPY_FR_m;
             }
             else if (td % (24*60*60*1000000LL) != 0) {
-                us_meta.base = NPY_FR_D;
+                us_meta.base = NPY_FR_h;
             }
             else if (td % (7*24*60*60*1000000LL) != 0) {
+                us_meta.base = NPY_FR_D;
+            }
+            else {
                 us_meta.base = NPY_FR_W;
             }
             us_meta.num = 1;
@@ -3679,11 +3682,11 @@ recursive_find_object_datetime64_type(PyObject *obj,
 
         return 0;
     }
-    /* Python date object -> 'D' */
-    else if (PyDate_Check(obj)) {
+    /* Python datetime object -> 'us' */
+    else if (PyDateTime_Check(obj)) {
         PyArray_DatetimeMetaData tmp_meta;
 
-        tmp_meta.base = NPY_FR_D;
+        tmp_meta.base = NPY_FR_us;
         tmp_meta.num = 1;
 
         /* Combine it with 'meta' */
@@ -3694,11 +3697,11 @@ recursive_find_object_datetime64_type(PyObject *obj,
 
         return 0;
     }
-    /* Python datetime object -> 'us' */
-    else if (PyDateTime_Check(obj)) {
+    /* Python date object -> 'D' */
+    else if (PyDate_Check(obj)) {
         PyArray_DatetimeMetaData tmp_meta;
 
-        tmp_meta.base = NPY_FR_us;
+        tmp_meta.base = NPY_FR_D;
         tmp_meta.num = 1;
 
         /* Combine it with 'meta' */
diff --git a/numpy/core/src/multiarray/dragon4.c b/numpy/core/src/multiarray/dragon4.c
index e005234a0..6706fc495 100644
--- a/numpy/core/src/multiarray/dragon4.c
+++ b/numpy/core/src/multiarray/dragon4.c
@@ -42,6 +42,18 @@
 #define DEBUG_ASSERT(stmnt) do {} while(0)
 #endif
 
+static inline npy_uint64
+bitmask_u64(npy_uint32 n)
+{
+    return ~(~((npy_uint64)0) << n);
+}
+
+static inline npy_uint32
+bitmask_u32(npy_uint32 n)
+{
+    return ~(~((npy_uint32)0) << n);
+}
+
 /*
  *  Get the log base 2 of a 32-bit unsigned integer.
  *  http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogLookup
@@ -139,13 +151,13 @@ BigInt_Copy(BigInt *dst, const BigInt *src)
 static void
 BigInt_Set_uint64(BigInt *i, npy_uint64 val)
 {
-    if (val > 0xFFFFFFFF) {
-        i->blocks[0] = val & 0xFFFFFFFF;
-        i->blocks[1] = (val >> 32) & 0xFFFFFFFF;
+    if (val > bitmask_u64(32)) {
+        i->blocks[0] = val & bitmask_u64(32);
+        i->blocks[1] = (val >> 32) & bitmask_u64(32);
         i->length = 2;
     }
     else if (val != 0) {
-        i->blocks[0] = val & 0xFFFFFFFF;
+        i->blocks[0] = val & bitmask_u64(32);
         i->length = 1;
     }
     else {
@@ -228,7 +240,7 @@ BigInt_Add(BigInt *result, const BigInt *lhs, const BigInt *rhs)
         npy_uint64 sum = carry + (npy_uint64)(*largeCur) +
                                  (npy_uint64)(*smallCur);
         carry = sum >> 32;
-        *resultCur = sum & 0xFFFFFFFF;
+        *resultCur = sum & bitmask_u64(32);
         ++largeCur;
         ++smallCur;
         ++resultCur;
@@ -238,7 +250,7 @@ BigInt_Add(BigInt *result, const BigInt *lhs, const BigInt *rhs)
     while (largeCur != largeEnd) {
         npy_uint64 sum = carry + (npy_uint64)(*largeCur);
         carry = sum >> 32;
-        (*resultCur) = sum & 0xFFFFFFFF;
+        (*resultCur) = sum & bitmask_u64(32);
         ++largeCur;
         ++resultCur;
     }
@@ -307,13 +319,13 @@ BigInt_Multiply(BigInt *result, const BigInt *lhs, const BigInt *rhs)
                 npy_uint64 product = (*resultCur) +
                                      (*largeCur)*(npy_uint64)multiplier + carry;
                 carry = product >> 32;
-                *resultCur = product & 0xFFFFFFFF;
+                *resultCur = product & bitmask_u64(32);
                 ++largeCur;
                 ++resultCur;
             } while(largeCur != large->blocks + large->length);
 
             DEBUG_ASSERT(resultCur < result->blocks + maxResultLen);
-            *resultCur = (npy_uint32)(carry & 0xFFFFFFFF);
+            *resultCur = (npy_uint32)(carry & bitmask_u64(32));
         }
     }
 
@@ -337,7 +349,7 @@ BigInt_Multiply_int(BigInt *result, const BigInt *lhs, npy_uint32 rhs)
     const npy_uint32 *pLhsEnd = lhs->blocks + lhs->length;
     for ( ; pLhsCur != pLhsEnd; ++pLhsCur, ++resultCur) {
         npy_uint64 product = (npy_uint64)(*pLhsCur) * rhs + carry;
-        *resultCur = (npy_uint32)(product & 0xFFFFFFFF);
+        *resultCur = (npy_uint32)(product & bitmask_u64(32));
         carry = product >> 32;
     }
 
@@ -414,7 +426,7 @@ BigInt_Multiply10(BigInt *result)
     npy_uint32 *end = result->blocks + result->length;
     for ( ; cur != end; ++cur) {
         npy_uint64 product = (npy_uint64)(*cur) * 10ull + carry;
-        (*cur) = (npy_uint32)(product & 0xFFFFFFFF);
+        (*cur) = (npy_uint32)(product & bitmask_u64(32));
         carry = product >> 32;
     }
 
@@ -654,7 +666,7 @@ BigInt_Pow10(BigInt *result, npy_uint32 exponent)
      * initialize the result by looking up a 32-bit power of 10 corresponding to
      * the first 3 bits
      */
-    smallExponent = exponent & 0x7;
+    smallExponent = exponent & bitmask_u32(3);
     BigInt_Set_uint32(curTemp, g_PowerOf10_U32[smallExponent]);
 
     /* remove the low bits that we used for the 32-bit lookup table */
@@ -704,7 +716,7 @@ BigInt_MultiplyPow10(BigInt *result, const BigInt *in, npy_uint32 exponent)
      * initialize the result by looking up a 32-bit power of 10 corresponding to
      * the first 3 bits
      */
-    smallExponent = exponent & 0x7;
+    smallExponent = exponent & bitmask_u32(3);
     if (smallExponent != 0) {
         BigInt_Multiply_int(curTemp, in, g_PowerOf10_U32[smallExponent]);
     }
@@ -788,7 +800,7 @@ BigInt_DivideWithRemainder_MaxQuotient9(BigInt *dividend, const BigInt *divisor)
      */
     DEBUG_ASSERT(!divisor->length == 0 &&
                 divisor->blocks[divisor->length-1] >= 8 &&
-                divisor->blocks[divisor->length-1] < 0xFFFFFFFF &&
+                divisor->blocks[divisor->length-1] < bitmask_u64(32) &&
                 dividend->length <= divisor->length);
 
     /*
@@ -825,10 +837,10 @@ BigInt_DivideWithRemainder_MaxQuotient9(BigInt *dividend, const BigInt *divisor)
             carry = product >> 32;
 
             difference = (npy_uint64)*dividendCur
-                       - (product & 0xFFFFFFFF) - borrow;
+                       - (product & bitmask_u64(32)) - borrow;
             borrow = (difference >> 32) & 1;
 
-            *dividendCur = difference & 0xFFFFFFFF;
+            *dividendCur = difference & bitmask_u64(32);
 
             ++divisorCur;
             ++dividendCur;
@@ -860,7 +872,7 @@ BigInt_DivideWithRemainder_MaxQuotient9(BigInt *dividend, const BigInt *divisor)
                                   - (npy_uint64)*divisorCur - borrow;
             borrow = (difference >> 32) & 1;
 
-            *dividendCur = difference & 0xFFFFFFFF;
+            *dividendCur = difference & bitmask_u64(32);
 
             ++divisorCur;
             ++dividendCur;
@@ -1442,8 +1454,8 @@ typedef union FloatUnion16
 } FloatUnion16;
 
 npy_bool   IsNegative_F16(FloatUnion16 *v) { return (v->integer >> 15) != 0; }
-npy_uint32 GetExponent_F16(FloatUnion16 *v) { return (v->integer >> 10) & 0x1F;}
-npy_uint32 GetMantissa_F16(FloatUnion16 *v) { return v->integer & 0x3FF; }
+npy_uint32 GetExponent_F16(FloatUnion16 *v) { return (v->integer >> 10) & bitmask_u32(5);}
+npy_uint32 GetMantissa_F16(FloatUnion16 *v) { return v->integer & bitmask_u32(10); }
 
 
 /*
@@ -1459,8 +1471,8 @@ typedef union FloatUnion32
 } FloatUnion32;
 
 npy_bool   IsNegative_F32(FloatUnion32 *v) { return (v->integer >> 31) != 0; }
-npy_uint32 GetExponent_F32(FloatUnion32 *v) { return (v->integer >> 23) & 0xFF;}
-npy_uint32 GetMantissa_F32(FloatUnion32 *v) { return v->integer & 0x7FFFFF; }
+npy_uint32 GetExponent_F32(FloatUnion32 *v) { return (v->integer >> 23) & bitmask_u32(8);}
+npy_uint32 GetMantissa_F32(FloatUnion32 *v) { return v->integer & bitmask_u32(23); }
 
 /*
  * Helper union to decompose a 64-bit IEEE float.
@@ -1474,8 +1486,8 @@ typedef union FloatUnion64
     npy_uint64 integer;
 } FloatUnion64;
 npy_bool   IsNegative_F64(FloatUnion64 *v) { return (v->integer >> 63) != 0; }
-npy_uint32 GetExponent_F64(FloatUnion64 *v) { return (v->integer >> 52) & 0x7FF; }
-npy_uint64 GetMantissa_F64(FloatUnion64 *v) { return v->integer & 0xFFFFFFFFFFFFFull; }
+npy_uint32 GetExponent_F64(FloatUnion64 *v) { return (v->integer >> 52) & bitmask_u32(11); }
+npy_uint64 GetMantissa_F64(FloatUnion64 *v) { return v->integer & bitmask_u64(52); }
 
 /*
  * Helper unions and datatype to decompose a 80-bit IEEE float
@@ -1496,9 +1508,9 @@ typedef struct FloatVal128 {
 npy_bool   IsNegative_F128(FloatVal128 *v) {
     return ((v->integer[1] >> 15) & 0x1) != 0;
 }
-npy_uint32 GetExponent_F128(FloatVal128 *v) { return v->integer[1] & 0x7FFF; }
+npy_uint32 GetExponent_F128(FloatVal128 *v) { return v->integer[1] & bitmask_u32(15); }
 npy_uint64 GetMantissa_F128(FloatVal128 *v) {
-    return v->integer[0] &  0x7FFFFFFFFFFFFFFFull;
+    return v->integer[0] & bitmask_u64(63);
 }
 
 /*
@@ -2151,7 +2163,7 @@ Dragon4_PrintFloat16(char *buffer, npy_uint32 bufferSize, npy_uint16 value,
     }
 
     /* if this is a special value */
-    if (floatExponent == 0x1F) {
+    if (floatExponent == bitmask_u32(5)) {
         return PrintInfNan(buffer, bufferSize, floatMantissa, 3, signbit);
     }
     /* else this is a number */
@@ -2248,7 +2260,7 @@ Dragon4_PrintFloat32(char *buffer, npy_uint32 bufferSize, npy_float32 value,
     }
 
     /* if this is a special value */
-    if (floatExponent == 0xFF) {
+    if (floatExponent == bitmask_u32(8)) {
         return PrintInfNan(buffer, bufferSize, floatMantissa, 6, signbit);
     }
     /* else this is a number */
@@ -2346,7 +2358,7 @@ Dragon4_PrintFloat64(char *buffer, npy_uint32 bufferSize, npy_float64 value,
     }
 
     /* if this is a special value */
-    if (floatExponent == 0x7FF) {
+    if (floatExponent == bitmask_u32(11)) {
         return PrintInfNan(buffer, bufferSize, floatMantissa, 13, signbit);
     }
     /* else this is a number */
@@ -2442,7 +2454,7 @@ Dragon4_PrintFloat128(char *buffer, npy_uint32 bufferSize, FloatVal128 value,
     }
 
     /* if this is a special value */
-    if (floatExponent == 0x7FFF) {
+    if (floatExponent == bitmask_u32(15)) {
         return PrintInfNan(buffer, bufferSize, floatMantissa, 16, signbit);
     }
     /* else this is a number */
diff --git a/numpy/core/src/multiarray/dtype_transfer.c b/numpy/core/src/multiarray/dtype_transfer.c
index 9c27255aa..9f9aa6757 100644
--- a/numpy/core/src/multiarray/dtype_transfer.c
+++ b/numpy/core/src/multiarray/dtype_transfer.c
@@ -3400,6 +3400,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
 {
     npy_intp src_itemsize, dst_itemsize;
     int src_type_num, dst_type_num;
+    int is_builtin;
 
 #if NPY_DT_DBG_TRACING
     printf("Calculating dtype transfer from ");
@@ -3439,6 +3440,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
     dst_itemsize = dst_dtype->elsize;
     src_type_num = src_dtype->type_num;
     dst_type_num = dst_dtype->type_num;
+    is_builtin = src_type_num < NPY_NTYPES && dst_type_num < NPY_NTYPES;
 
     /* Common special case - number -> number NBO cast */
     if (PyTypeNum_ISNUMBER(src_type_num) &&
@@ -3462,13 +3464,14 @@ PyArray_GetDTypeTransferFunction(int aligned,
     }
 
     /*
-     * If there are no references and the data types are equivalent,
+     * If there are no references and the data types are equivalent and builtin,
      * return a simple copy
      */
     if (PyArray_EquivTypes(src_dtype, dst_dtype) &&
             !PyDataType_REFCHK(src_dtype) && !PyDataType_REFCHK(dst_dtype) &&
             ( !PyDataType_HASFIELDS(dst_dtype) ||
-              is_dtype_struct_simple_unaligned_layout(dst_dtype)) ) {
+              is_dtype_struct_simple_unaligned_layout(dst_dtype)) &&
+            is_builtin) {
         /*
          * We can't pass through the aligned flag because it's not
          * appropriate. Consider a size-8 string, it will say it's
@@ -3494,7 +3497,7 @@ PyArray_GetDTypeTransferFunction(int aligned,
                 !PyDataType_HASSUBARRAY(dst_dtype) &&
                 src_type_num != NPY_DATETIME && src_type_num != NPY_TIMEDELTA) {
         /* A custom data type requires that we use its copy/swap */
-        if (src_type_num >= NPY_NTYPES || dst_type_num >= NPY_NTYPES) {
+        if (!is_builtin) {
             /*
              * If the sizes and kinds are identical, but they're different
              * custom types, then get a cast function
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 470a5fff9..0eab25299 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -1776,138 +1776,94 @@ get_sum_of_products_function(int nop, int type_num,
     return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
 }
 
+
 /*
- * Parses the subscripts for one operand into an output
- * of 'ndim' labels
+ * Parses the subscripts for one operand into an output of 'ndim'
+ * labels. The resulting 'op_labels' array will have:
+ *  - the ASCII code of the label for the first occurrence of a label;
+ *  - the (negative) offset to the first occurrence of the label for
+ *    repeated labels;
+ *  - zero for broadcast dimensions, if subscripts has an ellipsis.
+ * For example:
+ *  - subscripts="abbcbc",  ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2]
+ *  - subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99]
  */
+
 static int
 parse_operand_subscripts(char *subscripts, int length,
-                        int ndim,
-                        int iop, char *out_labels,
-                        char *out_label_counts,
-                        int *out_min_label,
-                        int *out_max_label,
-                        int *out_num_labels)
+                         int ndim, int iop, char *op_labels,
+                         char *label_counts, int *min_label, int *max_label)
 {
-    int i, idim, ndim_left, label;
-    int ellipsis = 0;
+    int i;
+    int idim = 0;
+    int ellipsis = -1;
 
-    /* Process the labels from the end until the ellipsis */
-    idim = ndim-1;
-    for (i = length-1; i >= 0; --i) {
-        label = subscripts[i];
-        /* A label for an axis */
+    /* Process all labels for this operand */
+    for (i = 0; i < length; ++i) {
+        int label = subscripts[i];
+
+        /* A proper label for an axis. */
         if (label > 0 && isalpha(label)) {
-            if (idim >= 0) {
-                out_labels[idim--] = label;
-                /* Calculate the min and max labels */
-                if (label < *out_min_label) {
-                    *out_min_label = label;
-                }
-                if (label > *out_max_label) {
-                    *out_max_label = label;
-                }
-                /* If it's the first time we see this label, count it */
-                if (out_label_counts[label] == 0) {
-                    (*out_num_labels)++;
-                }
-                out_label_counts[label]++;
-            }
-            else {
+            /* Check we don't exceed the operator dimensions. */
+            if (idim >= ndim) {
                 PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains "
-                            "too many subscripts for operand %d", iop);
-                return 0;
+                             "einstein sum subscripts string contains "
+                             "too many subscripts for operand %d", iop);
+                return -1;
+            }
+
+            op_labels[idim++] = label;
+            if (label < *min_label) {
+                *min_label = label;
             }
+            if (label > *max_label) {
+                *max_label = label;
+            }
+            label_counts[label]++;
         }
-        /* The end of the ellipsis */
+        /* The beginning of the ellipsis. */
         else if (label == '.') {
-            /* A valid ellipsis */
-            if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') {
-                ellipsis = 1;
-                length = i-2;
-                break;
-            }
-            else {
+            /* Check it's a proper ellipsis. */
+            if (ellipsis != -1 || i + 2 >= length
+                    || subscripts[++i] != '.' || subscripts[++i] != '.') {
                 PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') in "
-                            "operand %d", iop);
-                return 0;
-
+                             "einstein sum subscripts string contains a "
+                             "'.' that is not part of an ellipsis ('...') "
+                             "in operand %d", iop);
+                return -1;
             }
+
+            ellipsis = idim;
         }
         else if (label != ' ') {
             PyErr_Format(PyExc_ValueError,
-                        "invalid subscript '%c' in einstein sum "
-                        "subscripts string, subscripts must "
-                        "be letters", (char)label);
-            return 0;
+                         "invalid subscript '%c' in einstein sum "
+                         "subscripts string, subscripts must "
+                         "be letters", (char)label);
+            return -1;
         }
     }
 
-    if (!ellipsis && idim != -1) {
-        PyErr_Format(PyExc_ValueError,
-                    "operand has more dimensions than subscripts "
-                    "given in einstein sum, but no '...' ellipsis "
-                    "provided to broadcast the extra dimensions.");
-        return 0;
-    }
-
-    /* Reduce ndim to just the dimensions left to fill at the beginning */
-    ndim_left = idim+1;
-    idim = 0;
-
-    /*
-     * If we stopped because of an ellipsis, start again from the beginning.
-     * The length was truncated to end at the ellipsis in this case.
-     */
-    if (i > 0) {
-        for (i = 0; i < length; ++i) {
-            label = subscripts[i];
-            /* A label for an axis */
-            if (label > 0 && isalnum(label)) {
-                if (idim < ndim_left) {
-                    out_labels[idim++] = label;
-                    /* Calculate the min and max labels */
-                    if (label < *out_min_label) {
-                        *out_min_label = label;
-                    }
-                    if (label > *out_max_label) {
-                        *out_max_label = label;
-                    }
-                    /* If it's the first time we see this label, count it */
-                    if (out_label_counts[label] == 0) {
-                        (*out_num_labels)++;
-                    }
-                    out_label_counts[label]++;
-                }
-                else {
-                    PyErr_Format(PyExc_ValueError,
-                                "einstein sum subscripts string contains "
-                                "too many subscripts for operand %d", iop);
-                    return 0;
-                }
-            }
-            else if (label == '.') {
-                PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') in "
-                            "operand %d", iop);
-            }
-            else if (label != ' ') {
-                PyErr_Format(PyExc_ValueError,
-                            "invalid subscript '%c' in einstein sum "
-                            "subscripts string, subscripts must "
-                            "be letters", (char)label);
-                return 0;
-            }
+    /* No ellipsis found, labels must match dimensions exactly. */
+    if (ellipsis == -1) {
+        if (idim != ndim) {
+            PyErr_Format(PyExc_ValueError,
+                         "operand has more dimensions than subscripts "
+                         "given in einstein sum, but no '...' ellipsis "
+                         "provided to broadcast the extra dimensions.");
+            return -1;
         }
     }
-
-    /* Set the remaining labels to 0 */
-    while (idim < ndim_left) {
-        out_labels[idim++] = 0;
+    /* Ellipsis found, may have to add broadcast dimensions. */
+    else if (idim < ndim) {
+        /* Move labels after ellipsis to the end. */
+        for (i = 0; i < idim - ellipsis; ++i) {
+            op_labels[ndim - i - 1] = op_labels[idim - i - 1];
+        }
+        /* Set all broadcast dimensions to zero. */
+        for (i = 0; i < ndim - idim; ++i) {
+            op_labels[ellipsis + i] = 0;
+        }
     }
 
     /*
@@ -1918,158 +1874,116 @@ parse_operand_subscripts(char *subscripts, int length,
      * twos complement arithmetic the char is ok either way here, and
      * later where it matters the char is cast to a signed char.
      */
-    for (idim = 0; idim  < ndim-1; ++idim) {
-        char *next;
-        /* If this is a proper label, find any duplicates of it */
-        label = out_labels[idim];
+    for (idim = 0; idim < ndim - 1; ++idim) {
+        int label = op_labels[idim];
+        /* If it is a proper label, find any duplicates of it. */
         if (label > 0) {
-            /* Search for the next matching label */
-            next = (char *)memchr(out_labels+idim+1, label,
-                                    ndim-idim-1);
+            /* Search for the next matching label. */
+            char *next = memchr(op_labels + idim + 1, label, ndim - idim - 1);
+
             while (next != NULL) {
-                /* The offset from next to out_labels[idim] (negative) */
-                *next = (char)((out_labels+idim)-next);
-                /* Search for the next matching label */
-                next = (char *)memchr(next+1, label,
-                                        out_labels+ndim-1-next);
+                /* The offset from next to op_labels[idim] (negative). */
+                *next = (char)((op_labels + idim) - next);
+                /* Search for the next matching label. */
+                next = memchr(next + 1, label, op_labels + ndim - 1 - next);
             }
         }
     }
 
-    return 1;
+    return 0;
 }
 
+
 /*
- * Parses the subscripts for the output operand into an output
- * that requires 'ndim_broadcast' unlabeled dimensions, returning
- * the number of output dimensions.  Returns -1 if there is an error.
+ * Parses the subscripts for the output operand into an output that
+ * includes 'ndim_broadcast' unlabeled dimensions, and returns the total
+ * number of output dimensions, or -1 if there is an error. Similarly
+ * to parse_operand_subscripts, the 'out_labels' array will have, for
+ * each dimension:
+ *  - the ASCII code of the corresponding label;
+ *  - zero for broadcast dimensions, if subscripts has an ellipsis.
  */
 static int
 parse_output_subscripts(char *subscripts, int length,
                         int ndim_broadcast,
-                        const char *label_counts,
-                        char *out_labels)
+                        const char *label_counts, char *out_labels)
 {
-    int i, nlabels, label, idim, ndim, ndim_left;
+    int i, bdim;
+    int ndim = 0;
     int ellipsis = 0;
 
-    /* Count the labels, making sure they're all unique and valid */
-    nlabels = 0;
+    /* Process all the output labels. */
     for (i = 0; i < length; ++i) {
-        label = subscripts[i];
-        if (label > 0 && isalpha(label)) {
-            /* Check if it occurs again */
-            if (memchr(subscripts+i+1, label, length-i-1) == NULL) {
-                /* Check that it was used in the inputs */
-                if (label_counts[label] == 0) {
-                    PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string included "
-                            "output subscript '%c' which never appeared "
-                            "in an input", (char)label);
-                    return -1;
-                }
+        int label = subscripts[i];
 
-                nlabels++;
-            }
-            else {
+        /* A proper label for an axis. */
+        if (label > 0 && isalpha(label)) {
+            /* Check that it doesn't occur again. */
+            if (memchr(subscripts + i + 1, label, length - i - 1) != NULL) {
                 PyErr_Format(PyExc_ValueError,
-                        "einstein sum subscripts string includes "
-                        "output subscript '%c' multiple times",
-                        (char)label);
+                             "einstein sum subscripts string includes "
+                             "output subscript '%c' multiple times",
+                             (char)label);
                 return -1;
             }
-        }
-        else if (label != '.' && label != ' ') {
-            PyErr_Format(PyExc_ValueError,
-                        "invalid subscript '%c' in einstein sum "
-                        "subscripts string, subscripts must "
-                        "be letters", (char)label);
-            return -1;
-        }
-    }
-
-    /* The number of output dimensions */
-    ndim = ndim_broadcast + nlabels;
-
-    /* Process the labels from the end until the ellipsis */
-    idim = ndim-1;
-    for (i = length-1; i >= 0; --i) {
-        label = subscripts[i];
-        /* A label for an axis */
-        if (label != '.' && label != ' ') {
-            if (idim >= 0) {
-                out_labels[idim--] = label;
+            /* Check that it was used in the inputs. */
+            if (label_counts[label] == 0) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string included "
+                             "output subscript '%c' which never appeared "
+                             "in an input", (char)label);
+                return -1;
             }
-            else {
+            /* Check that there is room in out_labels for this label. */
+            if (ndim >= NPY_MAXDIMS) {
                 PyErr_Format(PyExc_ValueError,
-                            "einstein sum subscripts string contains "
-                            "too many output subscripts");
+                             "einstein sum subscripts string contains "
+                             "too many subscripts in the output");
                 return -1;
             }
+
+            out_labels[ndim++] = label;
         }
-        /* The end of the ellipsis */
+        /* The beginning of the ellipsis. */
         else if (label == '.') {
-            /* A valid ellipsis */
-            if (i >= 2 && subscripts[i-1] == '.' && subscripts[i-2] == '.') {
-                ellipsis = 1;
-                length = i-2;
-                break;
-            }
-            else {
+            /* Check it is a proper ellipsis. */
+            if (ellipsis || i + 2 >= length
+                    || subscripts[++i] != '.' || subscripts[++i] != '.') {
                 PyErr_SetString(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') "
-                            "in the output");
+                                "einstein sum subscripts string "
+                                "contains a '.' that is not part of "
+                                "an ellipsis ('...') in the output");
                 return -1;
-
             }
-        }
-    }
-
-    if (!ellipsis && idim != -1) {
-        PyErr_SetString(PyExc_ValueError,
-                    "output has more dimensions than subscripts "
-                    "given in einstein sum, but no '...' ellipsis "
-                    "provided to broadcast the extra dimensions.");
-        return 0;
-    }
-
-    /* Reduce ndim to just the dimensions left to fill at the beginning */
-    ndim_left = idim+1;
-    idim = 0;
-
-    /*
-     * If we stopped because of an ellipsis, start again from the beginning.
-     * The length was truncated to end at the ellipsis in this case.
-     */
-    if (i > 0) {
-        for (i = 0; i < length; ++i) {
-            label = subscripts[i];
-            if (label == '.') {
-                PyErr_SetString(PyExc_ValueError,
-                            "einstein sum subscripts string contains a "
-                            "'.' that is not part of an ellipsis ('...') "
-                            "in the output");
+            /* Check there is room in out_labels for broadcast dims. */
+            if (ndim + ndim_broadcast > NPY_MAXDIMS) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string contains "
+                             "too many subscripts in the output");
                 return -1;
             }
-            /* A label for an axis */
-            else if (label != ' ') {
-                if (idim < ndim_left) {
-                    out_labels[idim++] = label;
-                }
-                else {
-                    PyErr_Format(PyExc_ValueError,
-                                "einstein sum subscripts string contains "
-                                "too many subscripts for the output");
-                    return -1;
-                }
+
+            ellipsis = 1;
+            for (bdim = 0; bdim < ndim_broadcast; ++bdim) {
+                out_labels[ndim++] = 0;
             }
         }
+        else if (label != ' ') {
+            PyErr_Format(PyExc_ValueError,
+                         "invalid subscript '%c' in einstein sum "
+                         "subscripts string, subscripts must "
+                         "be letters", (char)label);
+            return -1;
+        }
     }
 
-    /* Set the remaining output labels to 0 */
-    while (idim < ndim_left) {
-        out_labels[idim++] = 0;
+    /* If no ellipsis was found there should be no broadcast dimensions. */
+    if (!ellipsis && ndim_broadcast > 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "output has more dimensions than subscripts "
+                        "given in einstein sum, but no '...' ellipsis "
+                        "provided to broadcast the extra dimensions.");
+        return -1;
     }
 
     return ndim;
@@ -2121,7 +2035,7 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
             if (ibroadcast == ndim_output) {
                 PyErr_SetString(PyExc_ValueError,
                         "output had too few broadcast dimensions");
-                return 0;
+                return -1;
             }
             new_dims[ibroadcast] = PyArray_DIM(op, idim);
             new_strides[ibroadcast] = PyArray_STRIDE(op, idim);
@@ -2144,7 +2058,7 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
                         "index '%c' don't match (%d != %d)",
                         iop, label, (int)new_dims[i],
                         (int)PyArray_DIM(op, idim));
-                return 0;
+                return -1;
             }
             new_dims[i] = PyArray_DIM(op, idim);
             new_strides[i] += PyArray_STRIDE(op, idim);
@@ -2162,14 +2076,14 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
                                 (PyObject *)op);
 
         if (*ret == NULL) {
-            return 0;
+            return -1;
         }
         if (!PyArray_Check(*ret)) {
             Py_DECREF(*ret);
             *ret = NULL;
             PyErr_SetString(PyExc_RuntimeError,
                         "NewFromDescr failed to return an array");
-            return 0;
+            return -1;
         }
         PyArray_UpdateFlags(*ret,
                     NPY_ARRAY_C_CONTIGUOUS|
@@ -2179,14 +2093,14 @@ get_single_op_view(PyArrayObject *op, int  iop, char *labels,
         if (PyArray_SetBaseObject(*ret, (PyObject *)op) < 0) {
             Py_DECREF(*ret);
             *ret = NULL;
-            return 0;
+            return -1;
         }
-        return 1;
+        return 0;
     }
 
     /* Return success, but that we couldn't make a view */
     *ret = NULL;
-    return 1;
+    return 0;
 }
 
 static PyArrayObject *
@@ -2332,7 +2246,7 @@ prepare_op_axes(int ndim, int iop, char *labels, int *axes,
         }
     }
 
-    return 1;
+    return 0;
 }
 
 static int
@@ -2613,7 +2527,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
                     NPY_ORDER order, NPY_CASTING casting,
                     PyArrayObject *out)
 {
-    int iop, label, min_label = 127, max_label = 0, num_labels;
+    int iop, label, min_label = 127, max_label = 0;
     char label_counts[128];
     char op_labels[NPY_MAXARGS][NPY_MAXDIMS];
     char output_labels[NPY_MAXDIMS], *iter_labels;
@@ -2644,7 +2558,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
 
     /* Parse the subscripts string into label_counts and op_labels */
     memset(label_counts, 0, sizeof(label_counts));
-    num_labels = 0;
     for (iop = 0; iop < nop; ++iop) {
         int length = (int)strcspn(subscripts, ",-");
 
@@ -2661,10 +2574,10 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
             return NULL;
         }
 
-        if (!parse_operand_subscripts(subscripts, length,
+        if (parse_operand_subscripts(subscripts, length,
                         PyArray_NDIM(op_in[iop]),
                         iop, op_labels[iop], label_counts,
-                        &min_label, &max_label, &num_labels)) {
+                        &min_label, &max_label) < 0) {
             return NULL;
         }
 
@@ -2698,21 +2611,18 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     }
 
     /*
-     * If there is no output signature, create one using each label
-     * that appeared once, in alphabetical order
+     * If there is no output signature, fill output_labels and ndim_output
+     * using each label that appeared once, in alphabetical order.
      */
     if (subscripts[0] == '\0') {
-        char outsubscripts[NPY_MAXDIMS + 3];
-        int length;
-        /* If no output was specified, always broadcast left (like normal) */
-        outsubscripts[0] = '.';
-        outsubscripts[1] = '.';
-        outsubscripts[2] = '.';
-        length = 3;
+        /* If no output was specified, always broadcast left, as usual. */
+        for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) {
+            output_labels[ndim_output] = 0;
+        }
         for (label = min_label; label <= max_label; ++label) {
             if (label_counts[label] == 1) {
-                if (length < NPY_MAXDIMS-1) {
-                    outsubscripts[length++] = label;
+                if (ndim_output < NPY_MAXDIMS) {
+                    output_labels[ndim_output++] = label;
                 }
                 else {
                     PyErr_SetString(PyExc_ValueError,
@@ -2722,10 +2632,6 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
                 }
             }
         }
-        /* Parse the output subscript string */
-        ndim_output = parse_output_subscripts(outsubscripts, length,
-                                        ndim_broadcast, label_counts,
-                                        output_labels);
     }
     else {
         if (subscripts[0] != '-' || subscripts[1] != '>') {
@@ -2736,13 +2642,13 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         }
         subscripts += 2;
 
-        /* Parse the output subscript string */
+        /* Parse the output subscript string. */
         ndim_output = parse_output_subscripts(subscripts, strlen(subscripts),
                                         ndim_broadcast, label_counts,
                                         output_labels);
-    }
-    if (ndim_output < 0) {
-        return NULL;
+        if (ndim_output < 0) {
+            return NULL;
+        }
     }
 
     if (out != NULL && PyArray_NDIM(out) != ndim_output) {
@@ -2776,9 +2682,9 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         if (iop == 0 && nop == 1 && out == NULL) {
             ret = NULL;
 
-            if (!get_single_op_view(op_in[iop], iop, labels,
-                                    ndim_output, output_labels,
-                                    &ret)) {
+            if (get_single_op_view(op_in[iop], iop, labels,
+                                   ndim_output, output_labels,
+                                   &ret) < 0) {
                 return NULL;
             }
 
@@ -2840,8 +2746,8 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     for (iop = 0; iop < nop; ++iop) {
         op_axes[iop] = op_axes_arrays[iop];
 
-        if (!prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
-                    op_axes[iop], ndim_iter, iter_labels)) {
+        if (prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
+                    op_axes[iop], ndim_iter, iter_labels) < 0) {
             goto fail;
         }
     }
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 4b2c6aa5a..42dbc3cce 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -293,8 +293,7 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n)
         if (commit_to_unpack) {
             /* propagate errors */
             if (tmp_obj == NULL) {
-                multi_DECREF(result, i);
-                return -1;
+                goto fail;
             }
         }
         else {
@@ -313,6 +312,16 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n)
                     || PySlice_Check(tmp_obj)
                     || tmp_obj == Py_Ellipsis
                     || tmp_obj == Py_None) {
+                if (DEPRECATE_FUTUREWARNING(
+                        "Using a non-tuple sequence for multidimensional "
+                        "indexing is deprecated; use `arr[tuple(seq)]` "
+                        "instead of `arr[seq]`. In the future this will be "
+                        "interpreted as an array index, `arr[np.array(seq)]`, "
+                        "which will result either in an error or a different "
+                        "result.") < 0) {
+                    i++;  /* since loop update doesn't run */
+                    goto fail;
+                }
                 commit_to_unpack = 1;
             }
         }
@@ -328,6 +337,10 @@ unpack_indices(PyObject *index, PyObject **result, npy_intp result_n)
         multi_DECREF(result, i);
         return unpack_scalar(index, result, result_n);
     }
+
+fail:
+    multi_DECREF(result, i);
+    return -1;
 }
 
 /**
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 6d323dbd8..896a3b07e 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -2652,21 +2652,31 @@ einsum_list_to_subscripts(PyObject *obj, char *subscripts, int subsize)
         /* Subscript */
         else if (PyInt_Check(item) || PyLong_Check(item)) {
             long s = PyInt_AsLong(item);
-            if ( s < 0 || s > 2*26) {
+            npy_bool bad_input = 0;
+
+            if (subindex + 1 >= subsize) {
                 PyErr_SetString(PyExc_ValueError,
-                        "subscript is not within the valid range [0, 52]");
+                        "subscripts list is too long");
                 Py_DECREF(obj);
                 return -1;
             }
-            if (s < 26) {
-                subscripts[subindex++] = 'A' + s;
+
+            if ( s < 0 ) {
+                bad_input = 1;
+            }
+            else if (s < 26) {
+                subscripts[subindex++] = 'A' + (char)s;
+            }
+            else if (s < 2*26) {
+                subscripts[subindex++] = 'a' + (char)s - 26;
             }
             else {
-                subscripts[subindex++] = 'a' + s;
+                bad_input = 1;
             }
-            if (subindex >= subsize) {
+
+            if (bad_input) {
                 PyErr_SetString(PyExc_ValueError,
-                        "subscripts list is too long");
+                        "subscript is not within the valid range [0, 52)");
                 Py_DECREF(obj);
                 return -1;
             }
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 915d743c8..14389a925 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -476,7 +476,9 @@ fast_scalar_power(PyArrayObject *a1, PyObject *o2, int inplace,
     double exponent;
     NPY_SCALARKIND kind;   /* NPY_NOSCALAR is not scalar */
 
-    if (PyArray_Check(a1) && ((kind=is_scalar_with_conversion(o2, &exponent))>0)) {
+    if (PyArray_Check(a1) &&
+            !PyArray_ISOBJECT(a1) &&
+            ((kind=is_scalar_with_conversion(o2, &exponent))>0)) {
         PyObject *fastop = NULL;
         if (PyArray_ISFLOAT(a1) || PyArray_ISCOMPLEX(a1)) {
             if (exponent == 1.0) {
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index cb4af0d12..6dc7e5a3e 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -4201,7 +4201,7 @@ doubletype_print(PyObject *o, FILE *fp, int flags)
         return -1;
     }
 
-    ret = PyObject_Print(to_print, fp, flags);
+    ret = PyObject_Print(to_print, fp, Py_PRINT_RAW);
     Py_DECREF(to_print);
     return ret;
 }
diff --git a/numpy/core/src/npymath/ieee754.c.src b/numpy/core/src/npymath/ieee754.c.src
index 5405c8fe3..fcfd15ab4 100644
--- a/numpy/core/src/npymath/ieee754.c.src
+++ b/numpy/core/src/npymath/ieee754.c.src
@@ -184,6 +184,7 @@ static npy_longdouble _nextl(npy_longdouble x, int p)
 {
     npy_int64 hx,ihx,ilx;
     npy_uint64 lx;
+    npy_longdouble u;
 
     GET_LDOUBLE_WORDS64(hx, lx, x);
     ihx = hx & 0x7fffffffffffffffLL;      /* |hx| */
@@ -194,7 +195,6 @@ static npy_longdouble _nextl(npy_longdouble x, int p)
         return x; /* signal the nan */
     }
     if(ihx == 0 && ilx == 0) {          /* x == 0 */
-        npy_longdouble u;
         SET_LDOUBLE_WORDS64(x, p, 0ULL);/* return +-minsubnormal */
         u = x * x;
         if (u == x) {
@@ -204,7 +204,6 @@ static npy_longdouble _nextl(npy_longdouble x, int p)
         }
     }
 
-    npy_longdouble u;
     if(p < 0) { /* p < 0, x -= ulp */
         if((hx==0xffefffffffffffffLL)&&(lx==0xfc8ffffffffffffeLL))
             return x+x; /* overflow, return -inf */
diff --git a/numpy/core/src/npymath/npy_math_private.h b/numpy/core/src/npymath/npy_math_private.h
index d75b9e991..7ee564239 100644
--- a/numpy/core/src/npymath/npy_math_private.h
+++ b/numpy/core/src/npymath/npy_math_private.h
@@ -287,8 +287,7 @@ do {                                                            \
     typedef npy_uint32 ldouble_man_t;
     typedef npy_uint32 ldouble_exp_t;
     typedef npy_uint32 ldouble_sign_t;
-#elif defined(HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_BE) || \
-      defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE)
+#elif defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE)
     /* 64 bits IEEE double precision aligned on 16 bytes: used by ppc arch on
      * Mac OS X */
 
@@ -477,7 +476,7 @@ do {                                                            \
      ((x).a[LDBL_MANH_INDEX] & ~LDBL_MANH_MASK) |                       \
      (((IEEEl2bitsrep_part)(v) << LDBL_MANH_SHIFT) & LDBL_MANH_MASK))
 
-#endif /* #ifndef HAVE_LDOUBLE_DOUBLE_DOUBLE_BE */
+#endif /* !HAVE_LDOUBLE_DOUBLE_DOUBLE_* */
 
 /*
  * Those unions are used to convert a pointer of npy_cdouble to native C99
diff --git a/numpy/core/src/private/npy_fpmath.h b/numpy/core/src/private/npy_fpmath.h
index 86b9cf3da..e1521de3b 100644
--- a/numpy/core/src/private/npy_fpmath.h
+++ b/numpy/core/src/private/npy_fpmath.h
@@ -7,39 +7,10 @@
 #include "numpy/npy_cpu.h"
 #include "numpy/npy_common.h"
 
-#ifdef NPY_OS_DARWIN
-    /* This hardcoded logic is fragile, but universal builds makes it
-     * difficult to detect arch-specific features */
-
-    /* MAC OS X < 10.4 and gcc < 4 does not support proper long double, and
-     * is the same as double on those platforms */
-    #if NPY_BITSOF_LONGDOUBLE == NPY_BITSOF_DOUBLE
-        /* This assumes that FPU and ALU have the same endianness */
-        #if NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
-            #define HAVE_LDOUBLE_IEEE_DOUBLE_LE
-        #elif NPY_BYTE_ORDER == NPY_BIG_ENDIAN
-            #define HAVE_LDOUBLE_IEEE_DOUBLE_BE
-        #else
-            #error Endianness undefined ?
-        #endif
-    #else
-        #if defined(NPY_CPU_X86)
-            #define HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE
-        #elif defined(NPY_CPU_AMD64)
-            #define HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE
-        #elif defined(NPY_CPU_PPC) || defined(NPY_CPU_PPC64)
-            #define HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_BE
-        #elif defined(NPY_CPU_PPC64LE)
-            #define HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_LE
-        #endif
-    #endif
-#endif
-
 #if !(defined(HAVE_LDOUBLE_IEEE_QUAD_BE) || \
       defined(HAVE_LDOUBLE_IEEE_QUAD_LE) || \
       defined(HAVE_LDOUBLE_IEEE_DOUBLE_LE) || \
       defined(HAVE_LDOUBLE_IEEE_DOUBLE_BE) || \
-      defined(HAVE_LDOUBLE_IEEE_DOUBLE_16_BYTES_BE) || \
       defined(HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE) || \
       defined(HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE) || \
       defined(HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE) || \
diff --git a/numpy/core/src/umath/_umath_tests.c.src b/numpy/core/src/umath/_umath_tests.c.src
index 120ce0332..2a74c1aaa 100644
--- a/numpy/core/src/umath/_umath_tests.c.src
+++ b/numpy/core/src/umath/_umath_tests.c.src
@@ -271,7 +271,7 @@ static char euclidean_pdist_signatures[] = { NPY_FLOAT, NPY_FLOAT,
                                              NPY_DOUBLE, NPY_DOUBLE };
 
 
-static void
+static int
 addUfuncs(PyObject *dictionary) {
     PyObject *f;
 
@@ -280,6 +280,13 @@ addUfuncs(PyObject *dictionary) {
                     "inner on the last dimension and broadcast on the rest \n"
                     "     \"(i),(i)->()\" \n",
                     0, inner1d_signature);
+    /*
+     * yes, this should not happen, but I (MHvK) just spent an hour looking at
+     * segfaults because I screwed up something that seemed totally unrelated.
+     */
+    if (f == NULL) {
+        return -1;
+    }
     PyDict_SetItemString(dictionary, "inner1d", f);
     Py_DECREF(f);
     f = PyUFunc_FromFuncAndDataAndSignature(innerwt_functions, innerwt_data,
@@ -287,6 +294,9 @@ addUfuncs(PyObject *dictionary) {
                     "inner1d with a weight argument \n"
                     "     \"(i),(i),(i)->()\" \n",
                     0, innerwt_signature);
+    if (f == NULL) {
+        return -1;
+    }
     PyDict_SetItemString(dictionary, "innerwt", f);
     Py_DECREF(f);
     f = PyUFunc_FromFuncAndDataAndSignature(matrix_multiply_functions,
@@ -295,6 +305,9 @@ addUfuncs(PyObject *dictionary) {
                     "matrix multiplication on last two dimensions \n"
                     "     \"(m,n),(n,p)->(m,p)\" \n",
                     0, matrix_multiply_signature);
+    if (f == NULL) {
+        return -1;
+    }
     PyDict_SetItemString(dictionary, "matrix_multiply", f);
     Py_DECREF(f);
     f = PyUFunc_FromFuncAndDataAndSignature(euclidean_pdist_functions,
@@ -303,27 +316,38 @@ addUfuncs(PyObject *dictionary) {
                     "pairwise euclidean distance on last two dimensions \n"
                     "     \"(n,d)->(p)\" \n",
                     0, euclidean_pdist_signature);
+    if (f == NULL) {
+        return -1;
+    }
     PyDict_SetItemString(dictionary, "euclidean_pdist", f);
     Py_DECREF(f);
     f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data,
                     inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d_no_doc",
                     NULL,
                     0, inner1d_signature);
+    if (f == NULL) {
+        return -1;
+    }
     PyDict_SetItemString(dictionary, "inner1d_no_doc", f);
     Py_DECREF(f);
+
+    return 0;
 }
 
 
 static PyObject *
 UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
 {
-    int nin, nout;
+    int nin, nout, i;
     PyObject *signature, *sig_str;
-    PyObject *f;
+    PyUFuncObject *f = NULL;
+    PyObject *core_num_dims = NULL, *core_dim_ixs = NULL;
     int core_enabled;
+    int core_num_ixs = 0;
 
-    if (!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) return NULL;
-
+    if (!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) {
+        return NULL;
+    }
 
     if (PyString_Check(signature)) {
         sig_str = signature;
@@ -334,17 +358,60 @@ UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
         return NULL;
     }
 
-    f = PyUFunc_FromFuncAndDataAndSignature(NULL, NULL, NULL,
+    f = (PyUFuncObject*)PyUFunc_FromFuncAndDataAndSignature(
+        NULL, NULL, NULL,
         0, nin, nout, PyUFunc_None, "no name",
         "doc:none",
         1, PyString_AS_STRING(sig_str));
     if (sig_str != signature) {
         Py_DECREF(sig_str);
     }
-    if (f == NULL) return NULL;
-    core_enabled = ((PyUFuncObject*)f)->core_enabled;
+    if (f == NULL) {
+        return NULL;
+    }
+    core_enabled = f->core_enabled;
+    /*
+     * Don't presume core_num_dims and core_dim_ixs are defined;
+     * they currently are even if core_enabled=0, but there's no real
+     * reason they should be.  So avoid segfaults if we change our mind.
+     */
+    if (f->core_num_dims != NULL) {
+        core_num_dims = PyTuple_New(f->nargs);
+        if (core_num_dims == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < f->nargs; i++) {
+            PyObject *val = PyLong_FromLong(f->core_num_dims[i]);
+            PyTuple_SET_ITEM(core_num_dims, i, val);
+            core_num_ixs += f->core_num_dims[i];
+        }
+    }
+    else {
+        Py_INCREF(Py_None);
+        core_num_dims = Py_None;
+    }
+    if (f->core_dim_ixs != NULL) {
+        core_dim_ixs = PyTuple_New(core_num_ixs);
+        if (core_num_dims == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < core_num_ixs; i++) {
+            PyObject * val = PyLong_FromLong(f->core_dim_ixs[i]);
+            PyTuple_SET_ITEM(core_dim_ixs, i, val);
+        }
+    }
+    else {
+        Py_INCREF(Py_None);
+        core_dim_ixs = Py_None;
+    }
     Py_DECREF(f);
-    return Py_BuildValue("i", core_enabled);
+    return Py_BuildValue("iOO", core_enabled, core_num_dims, core_dim_ixs);
+
+fail:
+    Py_XDECREF(f);
+    Py_XDECREF(core_num_dims);
+    Py_XDECREF(core_dim_ixs);
+    return NULL;
 }
 
 static PyMethodDef UMath_TestsMethods[] = {
@@ -371,15 +438,14 @@ static struct PyModuleDef moduledef = {
 };
 #endif
 
+/* Initialization function for the module */
 #if defined(NPY_PY3K)
-#define RETVAL m
-PyMODINIT_FUNC PyInit__umath_tests(void)
+#define RETVAL(x) x
+PyMODINIT_FUNC PyInit__umath_tests(void) {
 #else
-#define RETVAL
-PyMODINIT_FUNC
-init_umath_tests(void)
+#define RETVAL(x)
+PyMODINIT_FUNC init_umath_tests(void) {
 #endif
-{
     PyObject *m;
     PyObject *d;
     PyObject *version;
@@ -389,9 +455,9 @@ init_umath_tests(void)
 #else
     m = Py_InitModule("_umath_tests", UMath_TestsMethods);
 #endif
-    if (m == NULL)
-        return RETVAL;
-
+    if (m == NULL) {
+        return RETVAL(NULL);
+    }
     import_array();
     import_ufunc();
 
@@ -402,12 +468,13 @@ init_umath_tests(void)
     Py_DECREF(version);
 
     /* Load the ufunc operators into the module's namespace */
-    addUfuncs(d);
-
-    if (PyErr_Occurred()) {
+    if (addUfuncs(d) < 0) {
+        Py_DECREF(m);
+        PyErr_Print();
         PyErr_SetString(PyExc_RuntimeError,
                         "cannot load _umath_tests module.");
+        return RETVAL(NULL);
     }
 
-    return RETVAL;
+    return RETVAL(m);
 }
diff --git a/numpy/core/src/umath/override.c b/numpy/core/src/umath/override.c
index 123d9af87..c298fe315 100644
--- a/numpy/core/src/umath/override.c
+++ b/numpy/core/src/umath/override.c
@@ -29,7 +29,10 @@ normalize_signature_keyword(PyObject *normal_kwds)
                             "cannot specify both 'sig' and 'signature'");
             return -1;
         }
-        Py_INCREF(obj);
+        /*
+         * No INCREF or DECREF needed: got a borrowed reference above,
+         * and, unlike e.g. PyList_SetItem, PyDict_SetItem INCREF's it.
+         */
         PyDict_SetItemString(normal_kwds, "signature", obj);
         PyDict_DelItemString(normal_kwds, "sig");
     }
@@ -291,7 +294,6 @@ normalize_outer_args(PyUFuncObject *ufunc, PyObject *args,
     if (*normal_args == NULL) {
         return -1;
     }
-
     /* ufuncs accept 'sig' or 'signature' normalize to 'signature' */
     return normalize_signature_keyword(*normal_kwds);
 }
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 84a329475..aacf3f780 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -568,7 +568,8 @@ get_ufunc_arguments(PyUFuncObject *ufunc,
                     PyObject **out_typetup,
                     int *out_subok,
                     PyArrayObject **out_wheremask,
-                    PyObject **out_axes)
+                    PyObject **out_axes,
+                    int *out_keepdims)
 {
     int i, nargs;
     int nin = ufunc->nin;
@@ -823,9 +824,10 @@ get_ufunc_arguments(PyUFuncObject *ufunc,
 
             switch (str[0]) {
                 case 'a':
-                    /* possible axis argument for generalized ufunc */
+                    /* possible axes argument for generalized ufunc */
                     if (out_axes != NULL && strcmp(str, "axes") == 0) {
                         *out_axes = value;
+
                         bad_arg = 0;
                     }
                     break;
@@ -867,6 +869,17 @@ get_ufunc_arguments(PyUFuncObject *ufunc,
                         bad_arg = 0;
                     }
                     break;
+                case 'k':
+                    if (out_keepdims != NULL && strcmp(str, "keepdims") == 0) {
+                        if (!PyBool_Check(value)) {
+                            PyErr_SetString(PyExc_TypeError,
+                                        "'keepdims' must be a boolean");
+                            goto fail;
+                        }
+                        *out_keepdims = (value == Py_True);
+                        bad_arg = 0;
+                    }
+                    break;
                 case 'o':
                     /*
                      * Output arrays may be specified as a keyword argument,
@@ -1825,6 +1838,10 @@ make_full_arg_tuple(
         }
     }
 
+    /* No outputs in kwargs; if also none in args, we're done */
+    if (nargs == nin) {
+        return 0;
+    }
     /* copy across positional output arguments, adding trailing Nones */
     full_args->out = PyTuple_New(nout);
     if (full_args->out == NULL) {
@@ -1868,6 +1885,35 @@ _has_output_coredims(PyUFuncObject *ufunc) {
 }
 
 /*
+ * Check whether the gufunc can be used with keepdims, i.e., that all its
+ * input arguments have the same number of core dimension, and all output
+ * arguments have no core dimensions. Returns 0 if all is fine, and sets
+ * an error and returns -1 if not.
+ */
+static int
+_check_keepdims_support(PyUFuncObject *ufunc) {
+    int i;
+    int nin = ufunc->nin, nout = ufunc->nout;
+    int input_core_dims = ufunc->core_num_dims[0];
+    for (i = 1; i < nin + nout; i++) {
+        if (ufunc->core_num_dims[i] != (i < nin ? input_core_dims : 0)) {
+            PyErr_Format(PyExc_TypeError,
+                "%s does not support keepdims: its signature %s requires "
+                "that %s %d has %d core dimensions, but keepdims can only "
+                "be used when all inputs have the same number of core "
+                "dimensions and all outputs have no core dimensions.",
+                ufunc_get_name_cstr(ufunc),
+                ufunc->core_signature,
+                i < nin ? "input" : "output",
+                i < nin ? i : i - nin,
+                ufunc->core_num_dims[i]);
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/*
  * Interpret a possible axes keyword argument, using it to fill the remap_axis
  * array which maps default to actual axes for each operand, indexed as
  * as remap_axis[iop][iaxis]. The default axis order has first all broadcast
@@ -1876,8 +1922,8 @@ _has_output_coredims(PyUFuncObject *ufunc) {
  * Returns 0 on success, and -1 on failure
  */
 static int
-_parse_axes_arg(PyUFuncObject *ufunc, PyObject *axes, PyArrayObject **op,
-                int broadcast_ndim, int **remap_axis) {
+_parse_axes_arg(PyUFuncObject *ufunc, int core_num_dims[], PyObject *axes,
+                PyArrayObject **op, int broadcast_ndim, int **remap_axis) {
     int nin = ufunc->nin;
     int nout = ufunc->nout;
     int nop = nin + nout;
@@ -1907,7 +1953,7 @@ _parse_axes_arg(PyUFuncObject *ufunc, PyObject *axes, PyArrayObject **op,
         PyObject *op_axes_tuple, *axis_item;
         int axis, op_axis;
 
-        op_ncore = ufunc->core_num_dims[iop];
+        op_ncore = core_num_dims[iop];
         if (op[iop] != NULL) {
             op_ndim = PyArray_NDIM(op[iop]);
             op_nbroadcast = op_ndim - op_ncore;
@@ -2157,6 +2203,8 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
 
     /* Use remapped axes for generalized ufunc */
     int broadcast_ndim, iter_ndim;
+    int core_num_dims_array[NPY_MAXARGS];
+    int *core_num_dims;
     int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
     int *op_axes[NPY_MAXARGS];
 
@@ -2193,6 +2241,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
     NPY_CASTING casting = NPY_DEFAULT_ASSIGN_CASTING;
     /* When provided, extobj, typetup, and axes contain borrowed references */
     PyObject *extobj = NULL, *type_tup = NULL, *axes = NULL;
+    int keepdims = -1;
 
     if (ufunc == NULL) {
         PyErr_SetString(PyExc_ValueError, "function not supported");
@@ -2219,25 +2268,53 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
     /* Get all the arguments */
     retval = get_ufunc_arguments(ufunc, args, kwds,
                 op, &order, &casting, &extobj,
-                &type_tup, &subok, NULL, &axes);
+                &type_tup, &subok, NULL, &axes, &keepdims);
     if (retval < 0) {
         goto fail;
     }
-
+    /*
+     * If keepdims was passed in (and thus changed from the initial value
+     * on top), check the gufunc is suitable, i.e., that its inputs share
+     * the same number of core dimensions, and its outputs have none.
+     */
+    if (keepdims != -1) {
+        retval = _check_keepdims_support(ufunc);
+        if (retval < 0) {
+            goto fail;
+        }
+    }
+    /*
+     * If keepdims is set and true, signal all dimensions will be the same.
+     */
+    if (keepdims == 1) {
+        int num_dims = ufunc->core_num_dims[0];
+        for (i = 0; i < nop; ++i) {
+            core_num_dims_array[i] = num_dims;
+        }
+        core_num_dims = core_num_dims_array;
+    }
+    else {
+        /* keepdims was not set or was false; no adjustment necessary */
+        core_num_dims = ufunc->core_num_dims;
+        keepdims = 0;
+    }
     /*
      * Check that operands have the minimum dimensions required.
      * (Just checks core; broadcast dimensions are tested by the iterator.)
      */
     for (i = 0; i < nop; i++) {
-        if (op[i] != NULL && PyArray_NDIM(op[i]) < ufunc->core_num_dims[i]) {
+        if (op[i] != NULL && PyArray_NDIM(op[i]) < core_num_dims[i]) {
             PyErr_Format(PyExc_ValueError,
                          "%s: %s operand %d does not have enough "
                          "dimensions (has %d, gufunc core with "
                          "signature %s requires %d)",
-                         ufunc_get_name_cstr(ufunc),
+                         ufunc_name,
                          i < nin ? "Input" : "Output",
-                         i < nin ? i : i - nin, PyArray_NDIM(op[i]),
-                         ufunc->core_signature, ufunc->core_num_dims[i]);
+                         i < nin ? i : i - nin,
+                         PyArray_NDIM(op[i]),
+                         ufunc->core_signature,
+                         core_num_dims[i]);
+            retval = -1;
             goto fail;
         }
     }
@@ -2249,7 +2326,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
      */
     broadcast_ndim = 0;
     for (i = 0; i < nin; ++i) {
-        int n = PyArray_NDIM(op[i]) - ufunc->core_num_dims[i];
+        int n = PyArray_NDIM(op[i]) - core_num_dims[i];
         if (n > broadcast_ndim) {
             broadcast_ndim = n;
         }
@@ -2263,7 +2340,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
      */
     iter_ndim = broadcast_ndim;
     for (i = nin; i < nop; ++i) {
-        iter_ndim += ufunc->core_num_dims[i];
+        iter_ndim += core_num_dims[i];
     }
     if (iter_ndim > NPY_MAXDIMS) {
         PyErr_Format(PyExc_ValueError,
@@ -2285,7 +2362,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
         for (i=0; i < nop; i++) {
             remap_axis[i] = remap_axis_memory + i * NPY_MAXDIMS;
         }
-        retval = _parse_axes_arg(ufunc, axes, op, broadcast_ndim,
+        retval = _parse_axes_arg(ufunc, core_num_dims, axes, op, broadcast_ndim,
                                  remap_axis);
         if(retval < 0) {
             goto fail;
@@ -2307,12 +2384,13 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
     j = broadcast_ndim;
     for (i = 0; i < nop; ++i) {
         int n;
+
         if (op[i]) {
             /*
              * Note that n may be negative if broadcasting
              * extends into the core dimensions.
              */
-            n = PyArray_NDIM(op[i]) - ufunc->core_num_dims[i];
+            n = PyArray_NDIM(op[i]) - core_num_dims[i];
         }
         else {
             n = broadcast_ndim;
@@ -2336,10 +2414,15 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
         /* Except for when it belongs to this output */
         if (i >= nin) {
             int dim_offset = ufunc->core_offsets[i];
-            int num_dims = ufunc->core_num_dims[i];
-            /* Fill in 'iter_shape' and 'op_axes' for this output */
+            int num_dims = core_num_dims[i];
+            /*
+             * Fill in 'iter_shape' and 'op_axes' for the core dimensions
+             * of this output. Here, we have to be careful: if keepdims
+             * was used, then this axis is not a real core dimension,
+             * but is being added back for broadcasting, so its size is 1.
+             */
             for (idim = 0; idim < num_dims; ++idim) {
-                iter_shape[j] = core_dim_sizes[
+                iter_shape[j] = keepdims ? 1 : core_dim_sizes[
                                         ufunc->core_dim_ixs[dim_offset + idim]];
                 op_axes_arrays[i][j] = REMAP_AXIS(i, n + idim);
                 ++j;
@@ -2721,7 +2804,7 @@ PyUFunc_GenericFunction(PyUFuncObject *ufunc,
     /* Get all the arguments */
     retval = get_ufunc_arguments(ufunc, args, kwds,
                 op, &order, &casting, &extobj,
-                &type_tup, &subok, &wheremask, NULL);
+                &type_tup, &subok, &wheremask, NULL, NULL);
     if (retval < 0) {
         goto fail;
     }
@@ -4282,11 +4365,9 @@ static PyObject *
 ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
 {
     int i;
-    PyTupleObject *ret;
     PyArrayObject *mps[NPY_MAXARGS];
     PyObject *retobj[NPY_MAXARGS];
     PyObject *wraparr[NPY_MAXARGS];
-    PyObject *res;
     PyObject *override = NULL;
     ufunc_full_args full_args = {NULL, NULL};
     int errval;
@@ -4363,13 +4444,17 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
         int j = ufunc->nin+i;
         PyObject *wrap = wraparr[i];
 
-        if (wrap != NULL) {
+        if (wrap == NULL) {
+            /* default behavior */
+            retobj[i] = PyArray_Return(mps[j]);
+        }
+        else if (wrap == Py_None) {
+            Py_DECREF(wrap);
+            retobj[i] = (PyObject *)mps[j];
+        }
+        else {
+            PyObject *res;
             PyObject *args_tup;
-            if (wrap == Py_None) {
-                Py_DECREF(wrap);
-                retobj[i] = (PyObject *)mps[j];
-                continue;
-            }
 
             /* Call the method with appropriate context */
             args_tup = _get_wrap_prepare_args(full_args);
@@ -4389,15 +4474,9 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
             if (res == NULL) {
                 goto fail;
             }
-            else {
-                Py_DECREF(mps[j]);
-                retobj[i] = res;
-                continue;
-            }
-        }
-        else {
-            /* default behavior */
-            retobj[i] = PyArray_Return(mps[j]);
+
+            Py_DECREF(mps[j]);
+            retobj[i] = res;
         }
     }
 
@@ -4408,6 +4487,8 @@ ufunc_generic_call(PyUFuncObject *ufunc, PyObject *args, PyObject *kwds)
         return retobj[0];
     }
     else {
+        PyTupleObject *ret;
+
         ret = (PyTupleObject *)PyTuple_New(ufunc->nout);
         for (i = 0; i < ufunc->nout; i++) {
             PyTuple_SET_ITEM(ret, i, retobj[i]);
@@ -4523,7 +4604,7 @@ PyUFunc_FromFuncAndData(PyUFuncGenericFunction *func, void **data,
                         const char *name, const char *doc, int unused)
 {
     return PyUFunc_FromFuncAndDataAndSignature(func, data, types, ntypes,
-        nin, nout, identity, name, doc, 0, NULL);
+        nin, nout, identity, name, doc, unused, NULL);
 }
 
 /*UFUNC_API*/
diff --git a/numpy/core/src/umath/umathmodule.c b/numpy/core/src/umath/umathmodule.c
index 15da831b2..5567b9bbf 100644
--- a/numpy/core/src/umath/umathmodule.c
+++ b/numpy/core/src/umath/umathmodule.c
@@ -87,11 +87,12 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS
     /* Keywords are ignored for now */
 
     PyObject *function, *pyname = NULL;
-    int nin, nout, i;
+    int nin, nout, i, nargs;
     PyUFunc_PyFuncData *fdata;
     PyUFuncObject *self;
-    char *fname, *str;
+    char *fname, *str, *types, *doc;
     Py_ssize_t fname_len = -1;
+    void * ptr, **data;
     int offset[2];
 
     if (!PyArg_ParseTuple(args, "Oii:frompyfunc", &function, &nin, &nout)) {
@@ -101,43 +102,7 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS
         PyErr_SetString(PyExc_TypeError, "function must be callable");
         return NULL;
     }
-    if (nin + nout > NPY_MAXARGS) {
-        PyErr_Format(PyExc_ValueError,
-                     "Cannot construct a ufunc with more than %d operands "
-                     "(requested number were: inputs = %d and outputs = %d)",
-                     NPY_MAXARGS, nin, nout);
-        return NULL;
-    }
-    self = PyArray_malloc(sizeof(PyUFuncObject));
-    if (self == NULL) {
-        return NULL;
-    }
-    PyObject_Init((PyObject *)self, &PyUFunc_Type);
-
-    self->userloops = NULL;
-    self->nin = nin;
-    self->nout = nout;
-    self->nargs = nin + nout;
-    self->identity = PyUFunc_None;
-    self->functions = pyfunc_functions;
-    self->ntypes = 1;
-
-    /* generalized ufunc */
-    self->core_enabled = 0;
-    self->core_num_dim_ix = 0;
-    self->core_num_dims = NULL;
-    self->core_dim_ixs = NULL;
-    self->core_offsets = NULL;
-    self->core_signature = NULL;
-    self->op_flags = PyArray_malloc(sizeof(npy_uint32)*self->nargs);
-    if (self->op_flags == NULL) {
-        return PyErr_NoMemory();
-    }
-    memset(self->op_flags, 0, sizeof(npy_uint32)*self->nargs);
-    self->iter_flags = 0;
-
-    self->type_resolver = &object_ufunc_type_resolver;
-    self->legacy_inner_loop_selector = &object_ufunc_loop_selector;
+    nargs = nin + nout;
 
     pyname = PyObject_GetAttrString(function, "__name__");
     if (pyname) {
@@ -150,7 +115,7 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS
     }
 
     /*
-     * self->ptr holds a pointer for enough memory for
+     * ptr will be assigned to self->ptr, holds a pointer for enough memory for
      * self->data[0] (fdata)
      * self->data
      * self->name
@@ -164,39 +129,51 @@ ufunc_frompyfunc(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUS
     if (i) {
         offset[0] += (sizeof(void *) - i);
     }
-    offset[1] = self->nargs;
-    i = (self->nargs % sizeof(void *));
+    offset[1] = nargs;
+    i = (nargs % sizeof(void *));
     if (i) {
         offset[1] += (sizeof(void *)-i);
     }
-    self->ptr = PyArray_malloc(offset[0] + offset[1] + sizeof(void *) +
+    ptr = PyArray_malloc(offset[0] + offset[1] + sizeof(void *) +
                             (fname_len + 14));
-    if (self->ptr == NULL) {
+    if (ptr == NULL) {
         Py_XDECREF(pyname);
         return PyErr_NoMemory();
     }
-    Py_INCREF(function);
-    self->obj = function;
-    fdata = (PyUFunc_PyFuncData *)(self->ptr);
+    fdata = (PyUFunc_PyFuncData *)(ptr);
+    fdata->callable = function;
     fdata->nin = nin;
     fdata->nout = nout;
-    fdata->callable = function;
 
-    self->data = (void **)(((char *)self->ptr) + offset[0]);
-    self->data[0] = (void *)fdata;
-    self->types = (char *)self->data + sizeof(void *);
-    for (i = 0; i < self->nargs; i++) {
-        self->types[i] = NPY_OBJECT;
+    data = (void **)(((char *)ptr) + offset[0]);
+    data[0] = (void *)fdata;
+    types = (char *)data + sizeof(void *);
+    for (i = 0; i < nargs; i++) {
+        types[i] = NPY_OBJECT;
     }
-    str = self->types + offset[1];
+    str = types + offset[1];
     memcpy(str, fname, fname_len);
     memcpy(str+fname_len, " (vectorized)", 14);
-    self->name = str;
-
     Py_XDECREF(pyname);
 
     /* Do a better job someday */
-    self->doc = "dynamic ufunc based on a python function";
+    doc = "dynamic ufunc based on a python function";
+
+    self = (PyUFuncObject *)PyUFunc_FromFuncAndData(
+            (PyUFuncGenericFunction *)pyfunc_functions, data,
+            types, /* ntypes */ 1, nin, nout, PyUFunc_None,
+            str, doc, /* unused */ 0);
+
+    if (self == NULL) {
+        PyArray_free(ptr);
+        return NULL;
+    }
+    Py_INCREF(function);
+    self->obj = function;
+    self->ptr = ptr;
+
+    self->type_resolver = &object_ufunc_type_resolver;
+    self->legacy_inner_loop_selector = &object_ufunc_loop_selector;
 
     return (PyObject *)self;
 }
diff --git a/numpy/core/tests/test_api.py b/numpy/core/tests/test_api.py
index a927968a4..9755e7b36 100644
--- a/numpy/core/tests/test_api.py
+++ b/numpy/core/tests/test_api.py
@@ -223,22 +223,25 @@ def test_array_astype():
     b = a.astype('f4', subok=0, copy=False)
     assert_(a is b)
 
-    a = np.matrix([[0, 1, 2], [3, 4, 5]], dtype='f4')
+    class MyNDArray(np.ndarray):
+        pass
 
-    # subok=True passes through a matrix
+    a = np.array([[0, 1, 2], [3, 4, 5]], dtype='f4').view(MyNDArray)
+
+    # subok=True passes through a subclass
     b = a.astype('f4', subok=True, copy=False)
     assert_(a is b)
 
     # subok=True is default, and creates a subtype on a cast
     b = a.astype('i4', copy=False)
     assert_equal(a, b)
-    assert_equal(type(b), np.matrix)
+    assert_equal(type(b), MyNDArray)
 
-    # subok=False never returns a matrix
+    # subok=False never returns a subclass
     b = a.astype('f4', subok=False, copy=False)
     assert_equal(a, b)
     assert_(not (a is b))
-    assert_(type(b) is not np.matrix)
+    assert_(type(b) is not MyNDArray)
 
     # Make sure converting from string object to fixed length string
     # does not truncate.
diff --git a/numpy/core/tests/test_datetime.py b/numpy/core/tests/test_datetime.py
index dca2d2541..e433877e8 100644
--- a/numpy/core/tests/test_datetime.py
+++ b/numpy/core/tests/test_datetime.py
@@ -124,7 +124,7 @@ class TestDateTime(object):
         assert_(not np.can_cast('M8[h]', 'M8', casting='safe'))
 
     def test_compare_generic_nat(self):
-        # regression tests for GH6452
+        # regression tests for gh-6452
         assert_equal(np.datetime64('NaT'),
                      np.datetime64('2000') + np.timedelta64('NaT'))
         # nb. we may want to make NaT != NaT true in the future
@@ -236,18 +236,25 @@ class TestDateTime(object):
         # find "supertype" for non-dates and dates
 
         b = np.bool_(True)
-        dt = np.datetime64('1970-01-01', 'M')
-        arr = np.array([b, dt])
+        dm = np.datetime64('1970-01-01', 'M')
+        d = datetime.date(1970, 1, 1)
+        dt = datetime.datetime(1970, 1, 1, 12, 30, 40)
+
+        arr = np.array([b, dm])
         assert_equal(arr.dtype, np.dtype('O'))
 
-        dt = datetime.date(1970, 1, 1)
-        arr = np.array([b, dt])
+        arr = np.array([b, d])
         assert_equal(arr.dtype, np.dtype('O'))
 
-        dt = datetime.datetime(1970, 1, 1, 12, 30, 40)
         arr = np.array([b, dt])
         assert_equal(arr.dtype, np.dtype('O'))
 
+        arr = np.array([d, d]).astype('datetime64')
+        assert_equal(arr.dtype, np.dtype('M8[D]'))
+
+        arr = np.array([dt, dt]).astype('datetime64')
+        assert_equal(arr.dtype, np.dtype('M8[us]'))
+
     def test_timedelta_scalar_construction(self):
         # Construct with different units
         assert_equal(np.timedelta64(7, 'D'),
@@ -324,6 +331,24 @@ class TestDateTime(object):
         a = np.timedelta64(1, 'Y')
         assert_raises(TypeError, np.timedelta64, a, 'D')
         assert_raises(TypeError, np.timedelta64, a, 'm')
+        a = datetime.timedelta(seconds=3)
+        assert_raises(TypeError, np.timedelta64, a, 'M')
+        assert_raises(TypeError, np.timedelta64, a, 'Y')
+        a = datetime.timedelta(weeks=3)
+        assert_raises(TypeError, np.timedelta64, a, 'M')
+        assert_raises(TypeError, np.timedelta64, a, 'Y')
+        a = datetime.timedelta()
+        assert_raises(TypeError, np.timedelta64, a, 'M')
+        assert_raises(TypeError, np.timedelta64, a, 'Y')
+
+    def test_timedelta_object_array_conversion(self):
+        # Regression test for gh-11096
+        inputs = [datetime.timedelta(28),
+                  datetime.timedelta(30),
+                  datetime.timedelta(31)]
+        expected = np.array([28, 30, 31], dtype='timedelta64[D]')
+        actual = np.array(inputs, dtype='timedelta64[D]')
+        assert_equal(expected, actual)
 
     def test_timedelta_scalar_construction_units(self):
         # String construction detecting units
diff --git a/numpy/core/tests/test_deprecations.py b/numpy/core/tests/test_deprecations.py
index 5d59d8226..60a7c72f7 100644
--- a/numpy/core/tests/test_deprecations.py
+++ b/numpy/core/tests/test_deprecations.py
@@ -134,6 +134,22 @@ class _VisibleDeprecationTestCase(_DeprecationTestCase):
     warning_cls = np.VisibleDeprecationWarning
 
 
+class TestNonTupleNDIndexDeprecation(object):
+    def test_basic(self):
+        a = np.zeros((5, 5))
+        with warnings.catch_warnings():
+            warnings.filterwarnings('always')
+            assert_warns(FutureWarning, a.__getitem__, [[0, 1], [0, 1]])
+            assert_warns(FutureWarning, a.__getitem__, [slice(None)])
+
+            warnings.filterwarnings('error')
+            assert_raises(FutureWarning, a.__getitem__, [[0, 1], [0, 1]])
+            assert_raises(FutureWarning, a.__getitem__, [slice(None)])
+
+            # a a[[0, 1]] always was advanced indexing, so no error/warning
+            a[[0, 1]]
+
+
 class TestRankDeprecation(_DeprecationTestCase):
     """Test that np.rank is deprecated. The function should simply be
     removed. The VisibleDeprecationWarning may become unnecessary.
diff --git a/numpy/core/tests/test_einsum.py b/numpy/core/tests/test_einsum.py
index 104dd1986..63e75ff7a 100644
--- a/numpy/core/tests/test_einsum.py
+++ b/numpy/core/tests/test_einsum.py
@@ -491,8 +491,16 @@ class TestEinSum(object):
         assert_array_equal(np.einsum('ij,ij->j', p, q, optimize=True),
                            [10.] * 2)
 
-        p = np.ones((1, 5))
-        q = np.ones((5, 5))
+        # a blas-compatible contraction broadcasting case which was failing
+        # for optimize=True (ticket #10930)
+        x = np.array([2., 3.])
+        y = np.array([4.])
+        assert_array_equal(np.einsum("i, i", x, y, optimize=False), 20.)
+        assert_array_equal(np.einsum("i, i", x, y, optimize=True), 20.)
+
+        # all-ones array was bypassing bug (ticket #10930)
+        p = np.ones((1, 5)) / 2
+        q = np.ones((5, 5)) / 2
         for optimize in (True, False):
             assert_array_equal(np.einsum("...ij,...jk->...ik", p, p,
                                          optimize=optimize),
@@ -500,7 +508,7 @@ class TestEinSum(object):
                                          optimize=optimize))
             assert_array_equal(np.einsum("...ij,...jk->...ik", p, q,
                                          optimize=optimize),
-                               np.full((1, 5), 5))
+                               np.full((1, 5), 1.25))
 
         # Cases which were failing (gh-10899)
         x = np.eye(2, dtype=dtype)
@@ -596,6 +604,17 @@ class TestEinSum(object):
                      [[[1,  3], [3,  9], [5, 15], [7, 21]],
                      [[8, 16], [16, 32], [24, 48], [32, 64]]])
 
+    def test_subscript_range(self):
+        # Issue #7741, make sure that all letters of Latin alphabet (both uppercase & lowercase) can be used
+        # when creating a subscript from arrays
+        a = np.ones((2, 3))
+        b = np.ones((3, 4))
+        np.einsum(a, [0, 20], b, [20, 2], [0, 2], optimize=False)
+        np.einsum(a, [0, 27], b, [27, 2], [0, 2], optimize=False)
+        np.einsum(a, [0, 51], b, [51, 2], [0, 2], optimize=False)
+        assert_raises(ValueError, lambda: np.einsum(a, [0, 52], b, [52, 2], [0, 2], optimize=False))
+        assert_raises(ValueError, lambda: np.einsum(a, [-1, 5], b, [5, 2], [-1, 2], optimize=False))
+        
     def test_einsum_broadcast(self):
         # Issue #2455 change in handling ellipsis
         # remove the 'middle broadcast' error
diff --git a/numpy/core/tests/test_indexing.py b/numpy/core/tests/test_indexing.py
index 65852e577..88f5deabc 100644
--- a/numpy/core/tests/test_indexing.py
+++ b/numpy/core/tests/test_indexing.py
@@ -576,19 +576,6 @@ class TestSubclasses(object):
         assert_(isinstance(s[[0, 1, 2]], SubClass))
         assert_(isinstance(s[s > 0], SubClass))
 
-    def test_matrix_fancy(self):
-        # The matrix class messes with the shape. While this is always
-        # weird (getitem is not used, it does not have setitem nor knows
-        # about fancy indexing), this tests gh-3110
-        m = np.matrix([[1, 2], [3, 4]])
-
-        assert_(isinstance(m[[0,1,0], :], np.matrix))
-
-        # gh-3110. Note the transpose currently because matrices do *not*
-        # support dimension fixing for fancy indexing correctly.
-        x = np.asmatrix(np.arange(50).reshape(5,10))
-        assert_equal(x[:2, np.array(-1)], x[:2, -1].T)
-
     def test_finalize_gets_full_info(self):
         # Array finalize should be called on the filled array.
         class SubClass(np.ndarray):
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index f3032d394..a60f2cd92 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -1745,13 +1745,6 @@ class TestMethods(object):
         assert_equal(r, np.array([('a', 1), ('c', 3), ('b', 255), ('d', 258)],
                                  dtype=mydtype))
 
-    def test_sort_matrix_none(self):
-        a = np.matrix([[2, 1, 0]])
-        actual = np.sort(a, axis=None)
-        expected = np.matrix([[0, 1, 2]])
-        assert_equal(actual, expected)
-        assert_(type(expected) is np.matrix)
-
     def test_argsort(self):
         # all c scalar argsorts use the same code with different types
         # so it suffices to run a quick check with one type. The number
@@ -2497,14 +2490,6 @@ class TestMethods(object):
                 assert_array_equal(np.partition(d, kth)[kth], tgt,
                                    err_msg="data: %r\n kth: %r" % (d, kth))
 
-    def test_partition_matrix_none(self):
-        # gh-4301
-        a = np.matrix([[2, 1, 0]])
-        actual = np.partition(a, 1, axis=None)
-        expected = np.matrix([[0, 1, 2]])
-        assert_equal(actual, expected)
-        assert_(type(expected) is np.matrix)
-
     def test_argpartition_gh5524(self):
         #  A test for functionality of argpartition on lists.
         d = [6,7,3,2,9,0]
@@ -3332,7 +3317,39 @@ class TestBinop(object):
         with assert_raises(NotImplementedError):
             a ** 2
 
+    def test_pow_array_object_dtype(self):
+        # test pow on arrays of object dtype
+        class SomeClass(object):
+            def __init__(self, num=None):
+                self.num = num
+
+            # want to ensure a fast pow path is not taken
+            def __mul__(self, other):
+                raise AssertionError('__mul__ should not be called')
+
+            def __div__(self, other):
+                raise AssertionError('__div__ should not be called')
+            
+            def __pow__(self, exp):
+                return SomeClass(num=self.num ** exp)
+
+            def __eq__(self, other):
+                if isinstance(other, SomeClass):
+                    return self.num == other.num
+
+            __rpow__ = __pow__
 
+        def pow_for(exp, arr):
+            return np.array([x ** exp for x in arr])
+
+        obj_arr = np.array([SomeClass(1), SomeClass(2), SomeClass(3)])
+
+        assert_equal(obj_arr ** 0.5, pow_for(0.5, obj_arr))
+        assert_equal(obj_arr ** 0, pow_for(0, obj_arr))
+        assert_equal(obj_arr ** 1, pow_for(1, obj_arr))
+        assert_equal(obj_arr ** -1, pow_for(-1, obj_arr))
+        assert_equal(obj_arr ** 2, pow_for(2, obj_arr))
+        
 class TestTemporaryElide(object):
     # elision is only triggered on relatively large arrays
 
@@ -5279,13 +5296,6 @@ class TestDot(object):
         assert_equal(np.dot(b, a), res)
         assert_equal(np.dot(b, b), res)
 
-    def test_dot_scalar_and_matrix_of_objects(self):
-        # Ticket #2469
-        arr = np.matrix([1, 2], dtype=object)
-        desired = np.matrix([[3, 6]], dtype=object)
-        assert_equal(np.dot(arr, 3), desired)
-        assert_equal(np.dot(3, arr), desired)
-
     def test_accelerate_framework_sgemv_fix(self):
 
         def aligned_array(shape, align, dtype, order='C'):
@@ -5641,21 +5651,6 @@ class TestInner(object):
             assert_equal(np.inner(vec, sca), desired)
             assert_equal(np.inner(sca, vec), desired)
 
-    def test_inner_scalar_and_matrix(self):
-        for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?':
-            sca = np.array(3, dtype=dt)[()]
-            arr = np.matrix([[1, 2], [3, 4]], dtype=dt)
-            desired = np.matrix([[3, 6], [9, 12]], dtype=dt)
-            assert_equal(np.inner(arr, sca), desired)
-            assert_equal(np.inner(sca, arr), desired)
-
-    def test_inner_scalar_and_matrix_of_objects(self):
-        # Ticket #4482
-        arr = np.matrix([1, 2], dtype=object)
-        desired = np.matrix([[3, 6]], dtype=object)
-        assert_equal(np.inner(arr, 3), desired)
-        assert_equal(np.inner(3, arr), desired)
-
     def test_vecself(self):
         # Ticket 844.
         # Inner product of a vector with itself segfaults or give
@@ -6522,6 +6517,7 @@ class TestNewBufferProtocol(object):
         a = np.empty((1,) * 32)
         self._check_roundtrip(a)
 
+    @pytest.mark.skipif(sys.version_info < (2, 7, 7), reason="See gh-11115")
     def test_error_too_many_dims(self):
         def make_ctype(shape, scalar_type):
             t = scalar_type
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index b6b1c0f31..a0096efdb 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -1469,26 +1469,25 @@ def test_iter_allocate_output_types_scalar():
 
 def test_iter_allocate_output_subtype():
     # Make sure that the subtype with priority wins
+    class MyNDArray(np.ndarray):
+        __array_priority__ = 15
 
-    # matrix vs ndarray
-    a = np.matrix([[1, 2], [3, 4]])
+    # subclass vs ndarray
+    a = np.array([[1, 2], [3, 4]]).view(MyNDArray)
     b = np.arange(4).reshape(2, 2).T
     i = nditer([a, b, None], [],
-                    [['readonly'], ['readonly'], ['writeonly', 'allocate']])
+               [['readonly'], ['readonly'], ['writeonly', 'allocate']])
     assert_equal(type(a), type(i.operands[2]))
-    assert_(type(b) != type(i.operands[2]))
+    assert_(type(b) is not type(i.operands[2]))
     assert_equal(i.operands[2].shape, (2, 2))
 
-    # matrix always wants things to be 2D
-    b = np.arange(4).reshape(1, 2, 2)
-    assert_raises(RuntimeError, nditer, [a, b, None], [],
-                    [['readonly'], ['readonly'], ['writeonly', 'allocate']])
-    # but if subtypes are disabled, the result can still work
+    # If subtypes are disabled, we should get back an ndarray.
     i = nditer([a, b, None], [],
-            [['readonly'], ['readonly'], ['writeonly', 'allocate', 'no_subtype']])
+               [['readonly'], ['readonly'],
+                ['writeonly', 'allocate', 'no_subtype']])
     assert_equal(type(b), type(i.operands[2]))
-    assert_(type(a) != type(i.operands[2]))
-    assert_equal(i.operands[2].shape, (1, 2, 2))
+    assert_(type(a) is not type(i.operands[2]))
+    assert_equal(i.operands[2].shape, (2, 2))
 
 def test_iter_allocate_output_errors():
     # Check that the iterator will throw errors for bad output allocations
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index 40cccd404..53486dc51 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -552,7 +552,6 @@ class TestFloatExceptions(object):
         self.assert_raises_fpe(fpeerr, flop, sc1, sc2[()])
         self.assert_raises_fpe(fpeerr, flop, sc1[()], sc2[()])
 
-    @pytest.mark.xfail(reason="See ticket #2350")
     def test_floating_exceptions(self):
         # Test basic arithmetic function errors
         with np.errstate(all='raise'):
@@ -905,7 +904,7 @@ class TestTypes(object):
             fi = np.finfo(dt)
             assert_(np.can_cast(fi.min, dt))
             assert_(np.can_cast(fi.max, dt))
-            
+
 
 # Custom exception class to test exception propagation in fromiter
 class NIterError(Exception):
@@ -2201,13 +2200,16 @@ class TestLikeFuncs(object):
             self.compare_array_value(dz, value, fill_value)
 
         # Test the 'subok' parameter
-        a = np.matrix([[1, 2], [3, 4]])
+        class MyNDArray(np.ndarray):
+            pass
+
+        a = np.array([[1, 2], [3, 4]]).view(MyNDArray)
 
         b = like_function(a, **fill_kwarg)
-        assert_(type(b) is np.matrix)
+        assert_(type(b) is MyNDArray)
 
         b = like_function(a, subok=False, **fill_kwarg)
-        assert_(type(b) is not np.matrix)
+        assert_(type(b) is not MyNDArray)
 
     def test_ones_like(self):
         self.check_like_function(np.ones_like, 1)
diff --git a/numpy/core/tests/test_scalarprint.py b/numpy/core/tests/test_scalarprint.py
index 94d8294f1..a20ec9f74 100644
--- a/numpy/core/tests/test_scalarprint.py
+++ b/numpy/core/tests/test_scalarprint.py
@@ -4,9 +4,10 @@
 """
 from __future__ import division, absolute_import, print_function
 
-import tempfile
+import code, sys
+from tempfile import TemporaryFile
 import numpy as np
-from numpy.testing import assert_, assert_equal
+from numpy.testing import assert_, assert_equal, suppress_warnings
 
 
 class TestRealScalars(object):
@@ -53,7 +54,7 @@ class TestRealScalars(object):
         # output to a "real file" (ie, not a StringIO). Make sure we don't
         # inherit it.
         x = np.double(0.1999999999999)
-        with tempfile.TemporaryFile('r+t') as f:
+        with TemporaryFile('r+t') as f:
             print(x, file=f)
             f.seek(0)
             output = f.read()
@@ -62,6 +63,37 @@ class TestRealScalars(object):
         # precision as '0.2', but we want numpy's np.double('0.1999999999999')
         # to print the unique value, '0.1999999999999'.
 
+        # gh-11031
+        # Only in the python2 interactive shell and when stdout is a "real"
+        # file, the output of the last command is printed to stdout without
+        # Py_PRINT_RAW (unlike the print statement) so `>>> x` and `>>> print
+        # x` are potentially different. Make sure they are the same. The only
+        # way I found to get prompt-like output is using an actual prompt from
+        # the 'code' module. Again, must use tempfile to get a "real" file.
+
+        # dummy user-input which enters one line and then ctrl-Ds.
+        def userinput():
+            yield 'np.sqrt(2)'
+            raise EOFError
+        gen = userinput()
+        input_func = lambda prompt="": next(gen)
+
+        with TemporaryFile('r+t') as fo, TemporaryFile('r+t') as fe:
+            orig_stdout, orig_stderr = sys.stdout, sys.stderr
+            sys.stdout, sys.stderr = fo, fe
+
+            # py2 code.interact sends irrelevant internal DeprecationWarnings
+            with suppress_warnings() as sup:
+                sup.filter(DeprecationWarning)
+                code.interact(local={'np': np}, readfunc=input_func, banner='')
+
+            sys.stdout, sys.stderr = orig_stdout, orig_stderr
+
+            fo.seek(0)
+            capture = fo.read().strip()
+
+        assert_equal(capture, repr(np.sqrt(2)))
+
     def test_dragon4(self):
         # these tests are adapted from Ryan Juckett's dragon4 implementation,
         # see dragon4.c for details.
diff --git a/numpy/core/tests/test_shape_base.py b/numpy/core/tests/test_shape_base.py
index 1d91a651e..72b3451a4 100644
--- a/numpy/core/tests/test_shape_base.py
+++ b/numpy/core/tests/test_shape_base.py
@@ -364,10 +364,6 @@ def test_stack():
                         stack, [np.zeros((3, 3)), np.zeros(3)], axis=1)
     assert_raises_regex(ValueError, 'must have the same shape',
                         stack, [np.arange(2), np.arange(3)])
-    # np.matrix
-    m = np.matrix([[1, 2], [3, 4]])
-    assert_raises_regex(ValueError, 'shape too large to be a matrix',
-                        stack, [m, m])
 
 
 class TestBlock(object):
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index fe40456d5..b7fda3f2e 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -5,6 +5,7 @@ import itertools
 
 import numpy as np
 import numpy.core._umath_tests as umt
+import numpy.linalg._umath_linalg as uml
 import numpy.core._operand_flag_tests as opflag_tests
 import numpy.core._rational_tests as _rational_tests
 from numpy.testing import (
@@ -284,10 +285,16 @@ class TestUfunc(object):
     def test_signature(self):
         # the arguments to test_signature are: nin, nout, core_signature
         # pass
-        assert_equal(umt.test_signature(2, 1, "(i),(i)->()"), 1)
+        enabled, num_dims, ixs = umt.test_signature(2, 1, "(i),(i)->()")
+        assert_equal(enabled, 1)
+        assert_equal(num_dims, (1,  1,  0))
+        assert_equal(ixs, (0, 0))
 
-        # pass. empty core signature; treat as plain ufunc (with trivial core)
-        assert_equal(umt.test_signature(2, 1, "(),()->()"), 0)
+        # empty core signature; treat as plain ufunc (with trivial core)
+        enabled, num_dims, ixs = umt.test_signature(2, 1, "(),()->()")
+        assert_equal(enabled, 0)
+        assert_equal(num_dims, (0,  0,  0))
+        assert_equal(ixs, ())
 
         # in the following calls, a ValueError should be raised because
         # of error in core signature
@@ -326,7 +333,10 @@ class TestUfunc(object):
             pass
 
         # more complicated names for variables
-        assert_equal(umt.test_signature(2, 1, "(i1,i2),(J_1)->(_kAB)"), 1)
+        enabled, num_dims, ixs = umt.test_signature(2, 1, "(i1,i2),(J_1)->(_kAB)")
+        assert_equal(enabled, 1)
+        assert_equal(num_dims, (2, 1, 1))
+        assert_equal(ixs, (0, 1, 2, 3))
 
     def test_get_signature(self):
         assert_equal(umt.inner1d.signature, "(i),(i)->()")
@@ -611,49 +621,49 @@ class TestUfunc(object):
 
     def test_axes_argument(self):
         # inner1d signature: '(i),(i)->()'
-        in1d = umt.inner1d
+        inner1d = umt.inner1d
         a = np.arange(27.).reshape((3, 3, 3))
         b = np.arange(10., 19.).reshape((3, 1, 3))
         # basic tests on inputs (outputs tested below with matrix_multiply).
-        c = in1d(a, b)
+        c = inner1d(a, b)
         assert_array_equal(c, (a * b).sum(-1))
         # default
-        c = in1d(a, b, axes=[(-1,), (-1,), ()])
+        c = inner1d(a, b, axes=[(-1,), (-1,), ()])
         assert_array_equal(c, (a * b).sum(-1))
         # integers ok for single axis.
-        c = in1d(a, b, axes=[-1, -1, ()])
+        c = inner1d(a, b, axes=[-1, -1, ()])
         assert_array_equal(c, (a * b).sum(-1))
         # mix fine
-        c = in1d(a, b, axes=[(-1,), -1, ()])
+        c = inner1d(a, b, axes=[(-1,), -1, ()])
         assert_array_equal(c, (a * b).sum(-1))
         # can omit last axis.
-        c = in1d(a, b, axes=[-1, -1])
+        c = inner1d(a, b, axes=[-1, -1])
         assert_array_equal(c, (a * b).sum(-1))
         # can pass in other types of integer (with __index__ protocol)
-        c = in1d(a, b, axes=[np.int8(-1), np.array(-1, dtype=np.int32)])
+        c = inner1d(a, b, axes=[np.int8(-1), np.array(-1, dtype=np.int32)])
         assert_array_equal(c, (a * b).sum(-1))
         # swap some axes
-        c = in1d(a, b, axes=[0, 0])
+        c = inner1d(a, b, axes=[0, 0])
         assert_array_equal(c, (a * b).sum(0))
-        c = in1d(a, b, axes=[0, 2])
+        c = inner1d(a, b, axes=[0, 2])
         assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1))
         # Check errors for improperly constructed axes arguments.
         # should have list.
-        assert_raises(TypeError, in1d, a, b, axes=-1)
+        assert_raises(TypeError, inner1d, a, b, axes=-1)
         # needs enough elements
-        assert_raises(ValueError, in1d, a, b, axes=[-1])
+        assert_raises(ValueError, inner1d, a, b, axes=[-1])
         # should pass in indices.
-        assert_raises(TypeError, in1d, a, b, axes=[-1.0, -1.0])
-        assert_raises(TypeError, in1d, a, b, axes=[(-1.0,), -1])
-        assert_raises(TypeError, in1d, a, b, axes=[None, 1])
+        assert_raises(TypeError, inner1d, a, b, axes=[-1.0, -1.0])
+        assert_raises(TypeError, inner1d, a, b, axes=[(-1.0,), -1])
+        assert_raises(TypeError, inner1d, a, b, axes=[None, 1])
         # cannot pass an index unless there is only one dimension
         # (output is wrong in this case)
-        assert_raises(TypeError, in1d, a, b, axes=[-1, -1, -1])
+        assert_raises(TypeError, inner1d, a, b, axes=[-1, -1, -1])
         # or pass in generally the wrong number of axes
-        assert_raises(ValueError, in1d, a, b, axes=[-1, -1, (-1,)])
-        assert_raises(ValueError, in1d, a, b, axes=[-1, (-2, -1), ()])
+        assert_raises(ValueError, inner1d, a, b, axes=[-1, -1, (-1,)])
+        assert_raises(ValueError, inner1d, a, b, axes=[-1, (-2, -1), ()])
         # axes need to have same length.
-        assert_raises(ValueError, in1d, a, b, axes=[0, 1])
+        assert_raises(ValueError, inner1d, a, b, axes=[0, 1])
 
         # matrix_multiply signature: '(m,n),(n,p)->(m,p)'
         mm = umt.matrix_multiply
@@ -708,6 +718,73 @@ class TestUfunc(object):
         assert_raises(ValueError, mm, z[1], z, axes=[0, 1])
         assert_raises(ValueError, mm, z, z, out=z[0], axes=[0, 1])
 
+    def test_keepdims_argument(self):
+        # inner1d signature: '(i),(i)->()'
+        inner1d = umt.inner1d
+        a = np.arange(27.).reshape((3, 3, 3))
+        b = np.arange(10., 19.).reshape((3, 1, 3))
+        c = inner1d(a, b)
+        assert_array_equal(c, (a * b).sum(-1))
+        c = inner1d(a, b, keepdims=False)
+        assert_array_equal(c, (a * b).sum(-1))
+        c = inner1d(a, b, keepdims=True)
+        assert_array_equal(c, (a * b).sum(-1, keepdims=True))
+        out = np.zeros_like(c)
+        d = inner1d(a, b, keepdims=True, out=out)
+        assert_(d is out)
+        assert_array_equal(d, c)
+        # Now combined with axes.
+        c = inner1d(a, b, axes=[(-1,), (-1,), ()], keepdims=False)
+        assert_array_equal(c, (a * b).sum(-1))
+        c = inner1d(a, b, axes=[(-1,), (-1,), (-1,)], keepdims=True)
+        assert_array_equal(c, (a * b).sum(-1, keepdims=True))
+        c = inner1d(a, b, axes=[0, 0], keepdims=False)
+        assert_array_equal(c, (a * b).sum(0))
+        c = inner1d(a, b, axes=[0, 0, 0], keepdims=True)
+        assert_array_equal(c, (a * b).sum(0, keepdims=True))
+        c = inner1d(a, b, axes=[0, 2], keepdims=False)
+        assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1))
+        c = inner1d(a, b, axes=[0, 2], keepdims=True)
+        assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1,
+                                                             keepdims=True))
+        c = inner1d(a, b, axes=[0, 2, 2], keepdims=True)
+        assert_array_equal(c, (a.transpose(1, 2, 0) * b).sum(-1,
+                                                             keepdims=True))
+        c = inner1d(a, b, axes=[0, 2, 0], keepdims=True)
+        assert_array_equal(c, (a * b.transpose(2, 0, 1)).sum(0, keepdims=True))
+        # Hardly useful, but should work.
+        c = inner1d(a, b, axes=[0, 2, 1], keepdims=True)
+        assert_array_equal(c, (a.transpose(1, 0, 2) * b.transpose(0, 2, 1))
+                           .sum(1, keepdims=True))
+        # Check with two core dimensions.
+        a = np.eye(3) * np.arange(4.)[:, np.newaxis, np.newaxis]
+        expected = uml.det(a)
+        c = uml.det(a, keepdims=False)
+        assert_array_equal(c, expected)
+        c = uml.det(a, keepdims=True)
+        assert_array_equal(c, expected[:, np.newaxis, np.newaxis])
+        a = np.eye(3) * np.arange(4.)[:, np.newaxis, np.newaxis]
+        expected_s, expected_l = uml.slogdet(a)
+        cs, cl = uml.slogdet(a, keepdims=False)
+        assert_array_equal(cs, expected_s)
+        assert_array_equal(cl, expected_l)
+        cs, cl = uml.slogdet(a, keepdims=True)
+        assert_array_equal(cs, expected_s[:, np.newaxis, np.newaxis])
+        assert_array_equal(cl, expected_l[:, np.newaxis, np.newaxis])
+        # Sanity check on innerwt.
+        a = np.arange(6).reshape((2, 3))
+        b = np.arange(10, 16).reshape((2, 3))
+        w = np.arange(20, 26).reshape((2, 3))
+        assert_array_equal(umt.innerwt(a, b, w, keepdims=True),
+                           np.sum(a * b * w, axis=-1, keepdims=True))
+        # Check errors.
+        # Not a boolean
+        assert_raises(TypeError, inner1d, a, b, keepdims='true')
+        # 1 core dimension only.
+        mm = umt.matrix_multiply
+        assert_raises(TypeError, mm, a, b, keepdims=True)
+        assert_raises(TypeError, mm, a, b, keepdims=False)
+
     def test_innerwt(self):
         a = np.arange(6).reshape((2, 3))
         b = np.arange(10, 16).reshape((2, 3))
@@ -892,13 +969,6 @@ class TestUfunc(object):
         np.add.reduceat(arr, np.arange(4), out=arr, axis=-1)
         assert_array_equal(arr, out)
 
-    def test_object_scalar_multiply(self):
-        # Tickets #2469 and #4482
-        arr = np.matrix([1, 2], dtype=object)
-        desired = np.matrix([[3, 6]], dtype=object)
-        assert_equal(np.multiply(arr, 3), desired)
-        assert_equal(np.multiply(3, arr), desired)
-
     def test_zerosize_reduction(self):
         # Test with default dtype and object dtype
         for a in [[], np.array([], dtype=object)]:
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 1464a9e9a..93ec73094 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1675,13 +1675,16 @@ class TestSpecialMethods(object):
         assert_equal(ncu.maximum(a, C()), 0)
 
     def test_ufunc_override(self):
-
+        # check override works even with instance with high priority.
         class A(object):
             def __array_ufunc__(self, func, method, *inputs, **kwargs):
                 return self, func, method, inputs, kwargs
 
+        class MyNDArray(np.ndarray):
+            __array_priority__ = 100
+
         a = A()
-        b = np.matrix([1])
+        b = np.array([1]).view(MyNDArray)
         res0 = np.multiply(a, b)
         res1 = np.multiply(b, b, out=a)
 
@@ -1937,6 +1940,7 @@ class TestSpecialMethods(object):
         # outer, wrong args
         assert_raises(TypeError, np.multiply.outer, a)
         assert_raises(TypeError, np.multiply.outer, a, a, a, a)
+        assert_raises(TypeError, np.multiply.outer, a, a, sig='a', signature='a')
 
         # at
         res = np.multiply.at(a, [4, 2], 'b0')
@@ -2691,7 +2695,7 @@ def test_nextafterf():
 
 @pytest.mark.skipif(np.finfo(np.double) == np.finfo(np.longdouble),
                     reason="long double is same as double")
-@pytest.mark.skipif(platform.machine().startswith("ppc64"),
+@pytest.mark.xfail(condition=platform.machine().startswith("ppc64"),
                     reason="IBM double double")
 def test_nextafterl():
     return _test_nextafter(np.longdouble)
@@ -2724,7 +2728,7 @@ def test_spacingf():
 
 @pytest.mark.skipif(np.finfo(np.double) == np.finfo(np.longdouble),
                     reason="long double is same as double")
-@pytest.mark.skipif(platform.machine().startswith("ppc64"),
+@pytest.mark.xfail(condition=platform.machine().startswith("ppc64"),
                     reason="IBM double double")
 def test_spacingl():
     return _test_spacing(np.longdouble)
diff --git a/numpy/distutils/misc_util.py b/numpy/distutils/misc_util.py
index cb7414a04..41f0b1f61 100644
--- a/numpy/distutils/misc_util.py
+++ b/numpy/distutils/misc_util.py
@@ -256,6 +256,11 @@ def minrelpath(path):
         return ''
     return os.sep.join(l)
 
+def sorted_glob(fileglob):
+    """sorts output of python glob for http://bugs.python.org/issue30461
+    to allow extensions to have reproducible build results"""
+    return sorted(glob.glob(fileglob))
+
 def _fix_paths(paths, local_path, include_non_existing):
     assert is_sequence(paths), repr(type(paths))
     new_paths = []
@@ -263,8 +268,8 @@ def _fix_paths(paths, local_path, include_non_existing):
     for n in paths:
         if is_string(n):
             if '*' in n or '?' in n:
-                p = glob.glob(n)
-                p2 = glob.glob(njoin(local_path, n))
+                p = sorted_glob(n)
+                p2 = sorted_glob(njoin(local_path, n))
                 if p2:
                     new_paths.extend(p2)
                 elif p:
@@ -528,7 +533,7 @@ def _get_headers(directory_list):
     # get *.h files from list of directories
     headers = []
     for d in directory_list:
-        head = glob.glob(os.path.join(d, "*.h")) #XXX: *.hpp files??
+        head = sorted_glob(os.path.join(d, "*.h")) #XXX: *.hpp files??
         headers.extend(head)
     return headers
 
@@ -882,7 +887,7 @@ class Configuration(object):
                                  caller_level = 1):
         l = subpackage_name.split('.')
         subpackage_path = njoin([self.local_path]+l)
-        dirs = [_m for _m in glob.glob(subpackage_path) if os.path.isdir(_m)]
+        dirs = [_m for _m in sorted_glob(subpackage_path) if os.path.isdir(_m)]
         config_list = []
         for d in dirs:
             if not os.path.isfile(njoin(d, '__init__.py')):
diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py
index 27143e5c6..b604b8c52 100644
--- a/numpy/lib/_iotools.py
+++ b/numpy/lib/_iotools.py
@@ -205,7 +205,11 @@ class LineSplitter(object):
     #
 
     def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None):
+        delimiter = _decode_line(delimiter)
+        comments = _decode_line(comments)
+
         self.comments = comments
+
         # Delimiter is a character
         if (delimiter is None) or isinstance(delimiter, basestring):
             delimiter = delimiter or None
diff --git a/numpy/lib/arraypad.py b/numpy/lib/arraypad.py
index 600301c56..e9ca9de4d 100644
--- a/numpy/lib/arraypad.py
+++ b/numpy/lib/arraypad.py
@@ -74,6 +74,35 @@ def _round_ifneeded(arr, dtype):
         arr.round(out=arr)
 
 
+def _slice_at_axis(shape, sl, axis):
+    """
+    Construct a slice tuple the length of shape, with sl at the specified axis
+    """
+    slice_tup = (slice(None),)
+    return slice_tup * axis + (sl,) + slice_tup * (len(shape) - axis - 1)
+
+
+def _slice_first(shape, n, axis):
+    """ Construct a slice tuple to take the first n elements along axis """
+    return _slice_at_axis(shape, slice(0, n), axis=axis)
+
+
+def _slice_last(shape, n, axis):
+    """ Construct a slice tuple to take the last n elements along axis """
+    dim = shape[axis]  # doing this explicitly makes n=0 work
+    return _slice_at_axis(shape, slice(dim - n, dim), axis=axis)
+
+
+def _do_prepend(arr, pad_chunk, axis):
+    return np.concatenate(
+        (pad_chunk.astype(arr.dtype, copy=False), arr), axis=axis)
+
+
+def _do_append(arr, pad_chunk, axis):
+    return np.concatenate(
+        (arr, pad_chunk.astype(arr.dtype, copy=False)), axis=axis)
+
+
 def _prepend_const(arr, pad_amt, val, axis=-1):
     """
     Prepend constant `val` along `axis` of `arr`.
@@ -100,8 +129,7 @@ def _prepend_const(arr, pad_amt, val, axis=-1):
         return arr
     padshape = tuple(x if i != axis else pad_amt
                      for (i, x) in enumerate(arr.shape))
-    return np.concatenate((np.full(padshape, val, dtype=arr.dtype), arr),
-                          axis=axis)
+    return _do_prepend(arr, np.full(padshape, val, dtype=arr.dtype), axis)
 
 
 def _append_const(arr, pad_amt, val, axis=-1):
@@ -130,8 +158,8 @@ def _append_const(arr, pad_amt, val, axis=-1):
         return arr
     padshape = tuple(x if i != axis else pad_amt
                      for (i, x) in enumerate(arr.shape))
-    return np.concatenate((arr, np.full(padshape, val, dtype=arr.dtype)),
-                          axis=axis)
+    return _do_append(arr, np.full(padshape, val, dtype=arr.dtype), axis)
+
 
 
 def _prepend_edge(arr, pad_amt, axis=-1):
@@ -156,11 +184,9 @@ def _prepend_edge(arr, pad_amt, axis=-1):
     if pad_amt == 0:
         return arr
 
-    edge_slice = tuple(slice(None) if i != axis else slice(0, 1)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_first(arr.shape, 1, axis=axis)
     edge_arr = arr[edge_slice]
-    return np.concatenate((edge_arr.repeat(pad_amt, axis=axis), arr),
-                          axis=axis)
+    return _do_prepend(arr, edge_arr.repeat(pad_amt, axis=axis), axis)
 
 
 def _append_edge(arr, pad_amt, axis=-1):
@@ -186,11 +212,9 @@ def _append_edge(arr, pad_amt, axis=-1):
     if pad_amt == 0:
         return arr
 
-    edge_slice = tuple(slice(None) if i != axis else slice(x - 1, x)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_last(arr.shape, 1, axis=axis)
     edge_arr = arr[edge_slice]
-    return np.concatenate((arr, edge_arr.repeat(pad_amt, axis=axis)),
-                          axis=axis)
+    return _do_append(arr, edge_arr.repeat(pad_amt, axis=axis), axis)
 
 
 def _prepend_ramp(arr, pad_amt, end, axis=-1):
@@ -228,8 +252,7 @@ def _prepend_ramp(arr, pad_amt, end, axis=-1):
                                reverse=True).astype(np.float64)
 
     # Appropriate slicing to extract n-dimensional edge along `axis`
-    edge_slice = tuple(slice(None) if i != axis else slice(0, 1)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_first(arr.shape, 1, axis=axis)
 
     # Extract edge, and extend along `axis`
     edge_pad = arr[edge_slice].repeat(pad_amt, axis)
@@ -241,7 +264,7 @@ def _prepend_ramp(arr, pad_amt, end, axis=-1):
     _round_ifneeded(ramp_arr, arr.dtype)
 
     # Ramp values will most likely be float, cast them to the same type as arr
-    return np.concatenate((ramp_arr.astype(arr.dtype), arr), axis=axis)
+    return _do_prepend(arr, ramp_arr, axis)
 
 
 def _append_ramp(arr, pad_amt, end, axis=-1):
@@ -279,8 +302,7 @@ def _append_ramp(arr, pad_amt, end, axis=-1):
                                reverse=False).astype(np.float64)
 
     # Slice a chunk from the edge to calculate stats on
-    edge_slice = tuple(slice(None) if i != axis else slice(x - 1, x)
-                       for (i, x) in enumerate(arr.shape))
+    edge_slice = _slice_last(arr.shape, 1, axis=axis)
 
     # Extract edge, and extend along `axis`
     edge_pad = arr[edge_slice].repeat(pad_amt, axis)
@@ -292,7 +314,7 @@ def _append_ramp(arr, pad_amt, end, axis=-1):
     _round_ifneeded(ramp_arr, arr.dtype)
 
     # Ramp values will most likely be float, cast them to the same type as arr
-    return np.concatenate((arr, ramp_arr.astype(arr.dtype)), axis=axis)
+    return _do_append(arr, ramp_arr, axis)
 
 
 def _prepend_max(arr, pad_amt, num, axis=-1):
@@ -332,15 +354,13 @@ def _prepend_max(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    max_slice = tuple(slice(None) if i != axis else slice(num)
-                      for (i, x) in enumerate(arr.shape))
+    max_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate max
     max_chunk = arr[max_slice].max(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `max_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((max_chunk.repeat(pad_amt, axis=axis), arr),
-                          axis=axis)
+    return _do_prepend(arr, max_chunk.repeat(pad_amt, axis=axis), axis)
 
 
 def _append_max(arr, pad_amt, num, axis=-1):
@@ -379,11 +399,8 @@ def _append_max(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        max_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        max_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         max_slice = tuple(slice(None) for x in arr.shape)
 
@@ -391,8 +408,7 @@ def _append_max(arr, pad_amt, num, axis=-1):
     max_chunk = arr[max_slice].max(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `max_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((arr, max_chunk.repeat(pad_amt, axis=axis)),
-                          axis=axis)
+    return _do_append(arr, max_chunk.repeat(pad_amt, axis=axis), axis)
 
 
 def _prepend_mean(arr, pad_amt, num, axis=-1):
@@ -431,16 +447,14 @@ def _prepend_mean(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    mean_slice = tuple(slice(None) if i != axis else slice(num)
-                       for (i, x) in enumerate(arr.shape))
+    mean_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate mean
     mean_chunk = arr[mean_slice].mean(axis, keepdims=True)
     _round_ifneeded(mean_chunk, arr.dtype)
 
     # Concatenate `arr` with `mean_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((mean_chunk.repeat(pad_amt, axis).astype(arr.dtype),
-                           arr), axis=axis)
+    return _do_prepend(arr, mean_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _append_mean(arr, pad_amt, num, axis=-1):
@@ -479,11 +493,8 @@ def _append_mean(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        mean_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        mean_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         mean_slice = tuple(slice(None) for x in arr.shape)
 
@@ -492,8 +503,7 @@ def _append_mean(arr, pad_amt, num, axis=-1):
     _round_ifneeded(mean_chunk, arr.dtype)
 
     # Concatenate `arr` with `mean_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate(
-        (arr, mean_chunk.repeat(pad_amt, axis).astype(arr.dtype)), axis=axis)
+    return _do_append(arr, mean_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _prepend_med(arr, pad_amt, num, axis=-1):
@@ -532,16 +542,14 @@ def _prepend_med(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    med_slice = tuple(slice(None) if i != axis else slice(num)
-                      for (i, x) in enumerate(arr.shape))
+    med_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate median
     med_chunk = np.median(arr[med_slice], axis=axis, keepdims=True)
     _round_ifneeded(med_chunk, arr.dtype)
 
     # Concatenate `arr` with `med_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate(
-        (med_chunk.repeat(pad_amt, axis).astype(arr.dtype), arr), axis=axis)
+    return _do_prepend(arr, med_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _append_med(arr, pad_amt, num, axis=-1):
@@ -580,11 +588,8 @@ def _append_med(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        med_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        med_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         med_slice = tuple(slice(None) for x in arr.shape)
 
@@ -593,8 +598,7 @@ def _append_med(arr, pad_amt, num, axis=-1):
     _round_ifneeded(med_chunk, arr.dtype)
 
     # Concatenate `arr` with `med_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate(
-        (arr, med_chunk.repeat(pad_amt, axis).astype(arr.dtype)), axis=axis)
+    return _do_append(arr, med_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _prepend_min(arr, pad_amt, num, axis=-1):
@@ -634,15 +638,13 @@ def _prepend_min(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    min_slice = tuple(slice(None) if i != axis else slice(num)
-                      for (i, x) in enumerate(arr.shape))
+    min_slice = _slice_first(arr.shape, num, axis=axis)
 
     # Extract slice, calculate min
     min_chunk = arr[min_slice].min(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `min_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((min_chunk.repeat(pad_amt, axis=axis), arr),
-                          axis=axis)
+    return _do_prepend(arr, min_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _append_min(arr, pad_amt, num, axis=-1):
@@ -681,11 +683,8 @@ def _append_min(arr, pad_amt, num, axis=-1):
             num = None
 
     # Slice a chunk from the edge to calculate stats on
-    end = arr.shape[axis] - 1
     if num is not None:
-        min_slice = tuple(
-            slice(None) if i != axis else slice(end, end - num, -1)
-            for (i, x) in enumerate(arr.shape))
+        min_slice = _slice_last(arr.shape, num, axis=axis)
     else:
         min_slice = tuple(slice(None) for x in arr.shape)
 
@@ -693,8 +692,7 @@ def _append_min(arr, pad_amt, num, axis=-1):
     min_chunk = arr[min_slice].min(axis=axis, keepdims=True)
 
     # Concatenate `arr` with `min_chunk`, extended along `axis` by `pad_amt`
-    return np.concatenate((arr, min_chunk.repeat(pad_amt, axis=axis)),
-                          axis=axis)
+    return _do_append(arr, min_chunk.repeat(pad_amt, axis), axis=axis)
 
 
 def _pad_ref(arr, pad_amt, method, axis=-1):
@@ -737,15 +735,13 @@ def _pad_ref(arr, pad_amt, method, axis=-1):
     # Prepended region
 
     # Slice off a reverse indexed chunk from near edge to pad `arr` before
-    ref_slice = tuple(slice(None) if i != axis else slice(pad_amt[0], 0, -1)
-                      for (i, x) in enumerate(arr.shape))
+    ref_slice = _slice_at_axis(arr.shape, slice(pad_amt[0], 0, -1), axis=axis)
 
     ref_chunk1 = arr[ref_slice]
 
     # Memory/computationally more expensive, only do this if `method='odd'`
     if 'odd' in method and pad_amt[0] > 0:
-        edge_slice1 = tuple(slice(None) if i != axis else slice(0, 1)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice1 = _slice_first(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice1]
         ref_chunk1 = 2 * edge_chunk - ref_chunk1
         del edge_chunk
@@ -756,15 +752,12 @@ def _pad_ref(arr, pad_amt, method, axis=-1):
     # Slice off a reverse indexed chunk from far edge to pad `arr` after
     start = arr.shape[axis] - pad_amt[1] - 1
     end = arr.shape[axis] - 1
-    ref_slice = tuple(slice(None) if i != axis else slice(start, end)
-                      for (i, x) in enumerate(arr.shape))
-    rev_idx = tuple(slice(None) if i != axis else slice(None, None, -1)
-                    for (i, x) in enumerate(arr.shape))
+    ref_slice = _slice_at_axis(arr.shape, slice(start, end), axis=axis)
+    rev_idx = _slice_at_axis(arr.shape, slice(None, None, -1), axis=axis)
     ref_chunk2 = arr[ref_slice][rev_idx]
 
     if 'odd' in method:
-        edge_slice2 = tuple(slice(None) if i != axis else slice(x - 1, x)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice2 = _slice_last(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice2]
         ref_chunk2 = 2 * edge_chunk - ref_chunk2
         del edge_chunk
@@ -813,16 +806,13 @@ def _pad_sym(arr, pad_amt, method, axis=-1):
     # Prepended region
 
     # Slice off a reverse indexed chunk from near edge to pad `arr` before
-    sym_slice = tuple(slice(None) if i != axis else slice(0, pad_amt[0])
-                      for (i, x) in enumerate(arr.shape))
-    rev_idx = tuple(slice(None) if i != axis else slice(None, None, -1)
-                    for (i, x) in enumerate(arr.shape))
+    sym_slice = _slice_first(arr.shape, pad_amt[0], axis=axis)
+    rev_idx = _slice_at_axis(arr.shape, slice(None, None, -1), axis=axis)
     sym_chunk1 = arr[sym_slice][rev_idx]
 
     # Memory/computationally more expensive, only do this if `method='odd'`
     if 'odd' in method and pad_amt[0] > 0:
-        edge_slice1 = tuple(slice(None) if i != axis else slice(0, 1)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice1 = _slice_first(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice1]
         sym_chunk1 = 2 * edge_chunk - sym_chunk1
         del edge_chunk
@@ -831,15 +821,11 @@ def _pad_sym(arr, pad_amt, method, axis=-1):
     # Appended region
 
     # Slice off a reverse indexed chunk from far edge to pad `arr` after
-    start = arr.shape[axis] - pad_amt[1]
-    end = arr.shape[axis]
-    sym_slice = tuple(slice(None) if i != axis else slice(start, end)
-                      for (i, x) in enumerate(arr.shape))
+    sym_slice = _slice_last(arr.shape, pad_amt[1], axis=axis)
     sym_chunk2 = arr[sym_slice][rev_idx]
 
     if 'odd' in method:
-        edge_slice2 = tuple(slice(None) if i != axis else slice(x - 1, x)
-                            for (i, x) in enumerate(arr.shape))
+        edge_slice2 = _slice_last(arr.shape, 1, axis=axis)
         edge_chunk = arr[edge_slice2]
         sym_chunk2 = 2 * edge_chunk - sym_chunk2
         del edge_chunk
@@ -885,18 +871,14 @@ def _pad_wrap(arr, pad_amt, axis=-1):
     # Prepended region
 
     # Slice off a reverse indexed chunk from near edge to pad `arr` before
-    start = arr.shape[axis] - pad_amt[0]
-    end = arr.shape[axis]
-    wrap_slice = tuple(slice(None) if i != axis else slice(start, end)
-                       for (i, x) in enumerate(arr.shape))
+    wrap_slice = _slice_last(arr.shape, pad_amt[0], axis=axis)
     wrap_chunk1 = arr[wrap_slice]
 
     ##########################################################################
     # Appended region
 
     # Slice off a reverse indexed chunk from far edge to pad `arr` after
-    wrap_slice = tuple(slice(None) if i != axis else slice(0, pad_amt[1])
-                       for (i, x) in enumerate(arr.shape))
+    wrap_slice = _slice_first(arr.shape, pad_amt[1], axis=axis)
     wrap_chunk2 = arr[wrap_slice]
 
     # Concatenate `arr` with both chunks, extending along `axis`
diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py
index e8eda297f..4d3f35183 100644
--- a/numpy/lib/arraysetops.py
+++ b/numpy/lib/arraysetops.py
@@ -298,7 +298,7 @@ def _unique1d(ar, return_index=False, return_inverse=False,
     return ret
 
 
-def intersect1d(ar1, ar2, assume_unique=False):
+def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
     """
     Find the intersection of two arrays.
 
@@ -307,15 +307,28 @@ def intersect1d(ar1, ar2, assume_unique=False):
     Parameters
     ----------
     ar1, ar2 : array_like
-        Input arrays.
+        Input arrays. Will be flattened if not already 1D.
     assume_unique : bool
         If True, the input arrays are both assumed to be unique, which
         can speed up the calculation.  Default is False.
-
+    return_indices : bool
+        If True, the indices which correspond to the intersection of the 
+        two arrays are returned. The first instance of a value is used 
+        if there are multiple. Default is False. 
+    
+        .. versionadded:: 1.15.0    
+        
     Returns
     -------
     intersect1d : ndarray
         Sorted 1D array of common and unique elements.
+    comm1 : ndarray
+        The indices of the first occurrences of the common values in `ar1`.
+        Only provided if `return_indices` is True.
+    comm2 : ndarray
+        The indices of the first occurrences of the common values in `ar2`. 
+        Only provided if `return_indices` is True.
+
 
     See Also
     --------
@@ -332,14 +345,49 @@ def intersect1d(ar1, ar2, assume_unique=False):
     >>> from functools import reduce
     >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
     array([3])
+    
+    To return the indices of the values common to the input arrays
+    along with the intersected values:
+    >>> x = np.array([1, 1, 2, 3, 4])
+    >>> y = np.array([2, 1, 4, 6])
+    >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
+    >>> x_ind, y_ind
+    (array([0, 2, 4]), array([1, 0, 2]))
+    >>> xy, x[x_ind], y[y_ind]
+    (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
+    
     """
     if not assume_unique:
-        # Might be faster than unique( intersect1d( ar1, ar2 ) )?
-        ar1 = unique(ar1)
-        ar2 = unique(ar2)
+        if return_indices:
+            ar1, ind1 = unique(ar1, return_index=True)
+            ar2, ind2 = unique(ar2, return_index=True)
+        else:
+            ar1 = unique(ar1)
+            ar2 = unique(ar2)
+    else:
+        ar1 = ar1.ravel()
+        ar2 = ar2.ravel()
+        
     aux = np.concatenate((ar1, ar2))
-    aux.sort()
-    return aux[:-1][aux[1:] == aux[:-1]]
+    if return_indices:
+        aux_sort_indices = np.argsort(aux, kind='mergesort')
+        aux = aux[aux_sort_indices]
+    else:
+        aux.sort()
+
+    mask = aux[1:] == aux[:-1]
+    int1d = aux[:-1][mask]
+
+    if return_indices:
+        ar1_indices = aux_sort_indices[:-1][mask]
+        ar2_indices = aux_sort_indices[1:][mask] - ar1.size
+        if not assume_unique:
+            ar1_indices = ind1[ar1_indices]
+            ar2_indices = ind2[ar2_indices]
+
+        return int1d, ar1_indices, ar2_indices
+    else:
+        return int1d
 
 def setxor1d(ar1, ar2, assume_unique=False):
     """
@@ -660,3 +708,4 @@ def setdiff1d(ar1, ar2, assume_unique=False):
         ar1 = unique(ar1)
         ar2 = unique(ar2)
     return ar1[in1d(ar1, ar2, assume_unique=True, invert=True)]
+
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index afa154cc5..23eac7e7d 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -1,6 +1,8 @@
 """
-Binary Serialization
-====================
+Binary serialization
+
+NPY format
+==========
 
 A simple format for saving numpy arrays to disk with the full
 information about them.
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index 90e19769e..2922b3a86 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -877,12 +877,6 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
         # bins is an integer
         bins = D*[bins]
 
-    # avoid rounding issues for comparisons when dealing with inexact types
-    if np.issubdtype(sample.dtype, np.inexact):
-        edge_dt = sample.dtype
-    else:
-        edge_dt = float
-
     # normalize the range argument
     if range is None:
         range = (None,) * D
@@ -896,13 +890,12 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
                 raise ValueError(
                     '`bins[{}]` must be positive, when an integer'.format(i))
             smin, smax = _get_outer_edges(sample[:,i], range[i])
-            edges[i] = np.linspace(smin, smax, bins[i] + 1, dtype=edge_dt)
+            edges[i] = np.linspace(smin, smax, bins[i] + 1)
         elif np.ndim(bins[i]) == 1:
-            edges[i] = np.asarray(bins[i], edge_dt)
-            # not just monotonic, due to the use of mindiff below
-            if np.any(edges[i][:-1] >= edges[i][1:]):
+            edges[i] = np.asarray(bins[i])
+            if np.any(edges[i][:-1] > edges[i][1:]):
                 raise ValueError(
-                    '`bins[{}]` must be strictly increasing, when an array'
+                    '`bins[{}]` must be monotonically increasing, when an array'
                     .format(i))
         else:
             raise ValueError(
@@ -913,7 +906,8 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
 
     # Compute the bin number each sample falls into.
     Ncount = tuple(
-        np.digitize(sample[:, i], edges[i])
+        # avoid np.digitize to work around gh-11022
+        np.searchsorted(edges[i], sample[:, i], side='right')
         for i in _range(D)
     )
 
@@ -921,16 +915,10 @@ def histogramdd(sample, bins=10, range=None, normed=False, weights=None):
     # For the rightmost bin, we want values equal to the right edge to be
     # counted in the last bin, and not as an outlier.
     for i in _range(D):
-        # Rounding precision
-        mindiff = dedges[i].min()
-        if not np.isinf(mindiff):
-            decimal = int(-np.log10(mindiff)) + 6
-            # Find which points are on the rightmost edge.
-            not_smaller_than_edge = (sample[:, i] >= edges[i][-1])
-            on_edge = (np.around(sample[:, i], decimal) ==
-                       np.around(edges[i][-1], decimal))
-            # Shift these points one bin to the left.
-            Ncount[i][on_edge & not_smaller_than_edge] -= 1
+        # Find which points are on the rightmost edge.
+        on_edge = (sample[:, i] == edges[i][-1])
+        # Shift these points one bin to the left.
+        Ncount[i][on_edge] -= 1
 
     # Compute the sample indices in the flattened histogram matrix.
     # This raises an error if the array is too large.
diff --git a/numpy/lib/index_tricks.py b/numpy/lib/index_tricks.py
index 43fdc5627..d2139338e 100644
--- a/numpy/lib/index_tricks.py
+++ b/numpy/lib/index_tricks.py
@@ -201,7 +201,7 @@ class nd_grid(object):
                 slobj = [_nx.newaxis]*len(size)
                 for k in range(len(size)):
                     slobj[k] = slice(None, None)
-                    nn[k] = nn[k][slobj]
+                    nn[k] = nn[k][tuple(slobj)]
                     slobj[k] = _nx.newaxis
             return nn
         except (IndexError, TypeError):
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 5d3c1e525..390927601 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -26,9 +26,11 @@ from numpy.compat import (
 
 if sys.version_info[0] >= 3:
     import pickle
+    from collections.abc import Mapping
 else:
     import cPickle as pickle
     from future_builtins import map
+    from collections import Mapping
 
 
 def loads(*args, **kwargs):
@@ -92,7 +94,7 @@ class BagObj(object):
 
         This also enables tab-completion in an interpreter or IPython.
         """
-        return object.__getattribute__(self, '_obj').keys()
+        return list(object.__getattribute__(self, '_obj').keys())
 
 
 def zipfile_factory(file, *args, **kwargs):
@@ -110,7 +112,7 @@ def zipfile_factory(file, *args, **kwargs):
     return zipfile.ZipFile(file, *args, **kwargs)
 
 
-class NpzFile(object):
+class NpzFile(Mapping):
     """
     NpzFile(fid)
 
@@ -216,6 +218,13 @@ class NpzFile(object):
     def __del__(self):
         self.close()
 
+    # Implement the Mapping ABC
+    def __iter__(self):
+        return iter(self.files)
+
+    def __len__(self):
+        return len(self.files)
+
     def __getitem__(self, key):
         # FIXME: This seems like it will copy strings around
         #   more than is strictly necessary.  The zipfile
@@ -225,11 +234,11 @@ class NpzFile(object):
         #   It would be better if the zipfile could read
         #   (or at least uncompress) the data
         #   directly into the array memory.
-        member = 0
+        member = False
         if key in self._files:
-            member = 1
+            member = True
         elif key in self.files:
-            member = 1
+            member = True
             key += '.npy'
         if member:
             bytes = self.zip.open(key)
@@ -245,31 +254,27 @@ class NpzFile(object):
         else:
             raise KeyError("%s is not a file in the archive" % key)
 
-    def __iter__(self):
-        return iter(self.files)
 
-    def items(self):
-        """
-        Return a list of tuples, with each tuple (filename, array in file).
+    if sys.version_info.major == 3:
+        # deprecate the python 2 dict apis that we supported by accident in
+        # python 3. We forgot to implement itervalues() at all in earlier
+        # versions of numpy, so no need to deprecated it here.
 
-        """
-        return [(f, self[f]) for f in self.files]
-
-    def iteritems(self):
-        """Generator that returns tuples (filename, array in file)."""
-        for f in self.files:
-            yield (f, self[f])
-
-    def keys(self):
-        """Return files in the archive with a ``.npy`` extension."""
-        return self.files
-
-    def iterkeys(self):
-        """Return an iterator over the files in the archive."""
-        return self.__iter__()
+        def iteritems(self):
+            # Numpy 1.15, 2018-02-20
+            warnings.warn(
+                "NpzFile.iteritems is deprecated in python 3, to match the "
+                "removal of dict.itertems. Use .items() instead.",
+                DeprecationWarning, stacklevel=2)
+            return self.items()
 
-    def __contains__(self, key):
-        return self.files.__contains__(key)
+        def iterkeys(self):
+            # Numpy 1.15, 2018-02-20
+            warnings.warn(
+                "NpzFile.iterkeys is deprecated in python 3, to match the "
+                "removal of dict.iterkeys. Use .keys() instead.",
+                DeprecationWarning, stacklevel=2)
+            return self.keys()
 
 
 def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
@@ -475,7 +480,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
 
     Notes
     -----
-    For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`
+    For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
 
     Examples
     --------
@@ -559,7 +564,7 @@ def savez(file, *args, **kwds):
     The ``.npz`` file format is a zipped archive of files named after the
     variables they contain.  The archive is not compressed and each file
     in the archive contains one variable in ``.npy`` format. For a
-    description of the ``.npy`` format, see :py:mod:`numpy.lib.format`
+    description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
 
     When opening the saved ``.npz`` file with `load` a `NpzFile` object is
     returned. This is a dictionary-like object which can be queried for
@@ -639,7 +644,7 @@ def savez_compressed(file, *args, **kwds):
     variables they contain.  The archive is compressed with
     ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable
     in ``.npy`` format. For a description of the ``.npy`` format, see 
-    :py:mod:`numpy.lib.format`
+    :py:mod:`numpy.lib.format`.
 
 
     When opening the saved ``.npz`` file with `load` a `NpzFile` object is
diff --git a/numpy/lib/polynomial.py b/numpy/lib/polynomial.py
index 41b5e2f64..078608bbb 100644
--- a/numpy/lib/polynomial.py
+++ b/numpy/lib/polynomial.py
@@ -113,11 +113,6 @@ def poly(seq_of_zeros):
     >>> np.poly(P)
     array([ 1.        ,  0.        ,  0.16666667])
 
-    Or a square matrix object:
-
-    >>> np.poly(np.matrix(P))
-    array([ 1.        ,  0.        ,  0.16666667])
-
     Note how in all cases the leading coefficient is always 1.
 
     """
diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index e9ba38f46..c455bd93f 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -397,12 +397,13 @@ def merge_arrays(seqarrays, fill_value=-1, flatten=False,
     Notes
     -----
     * Without a mask, the missing value will be filled with something,
-    * depending on what its corresponding type:
-            -1      for integers
-            -1.0    for floating point numbers
-            '-'     for characters
-            '-1'    for strings
-            True    for boolean values
+      depending on what its corresponding type:
+
+      * ``-1``      for integers
+      * ``-1.0``    for floating point numbers
+      * ``'-'``     for characters
+      * ``'-1'``    for strings
+      * ``True``    for boolean values
     * XXX: I just obtained these values empirically
     """
     # Only one item in the input sequence ?
diff --git a/numpy/lib/scimath.py b/numpy/lib/scimath.py
index e07caf805..f1838fee6 100644
--- a/numpy/lib/scimath.py
+++ b/numpy/lib/scimath.py
@@ -555,7 +555,7 @@ def arctanh(x):
     --------
     >>> np.set_printoptions(precision=4)
 
-    >>> np.emath.arctanh(np.matrix(np.eye(2)))
+    >>> np.emath.arctanh(np.eye(2))
     array([[ Inf,   0.],
            [  0.,  Inf]])
     >>> np.emath.arctanh([1j])
diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py
index 41ef28ef3..65104115a 100644
--- a/numpy/lib/shape_base.py
+++ b/numpy/lib/shape_base.py
@@ -16,10 +16,235 @@ from numpy.matrixlib.defmatrix import matrix  # this raises all the right alarm
 __all__ = [
     'column_stack', 'row_stack', 'dstack', 'array_split', 'split',
     'hsplit', 'vsplit', 'dsplit', 'apply_over_axes', 'expand_dims',
-    'apply_along_axis', 'kron', 'tile', 'get_array_wrap'
+    'apply_along_axis', 'kron', 'tile', 'get_array_wrap', 'take_along_axis',
+    'put_along_axis'
     ]
 
 
+def _make_along_axis_idx(arr_shape, indices, axis):
+	# compute dimensions to iterate over
+    if not _nx.issubdtype(indices.dtype, _nx.integer):
+        raise IndexError('`indices` must be an integer array')
+    if len(arr_shape) != indices.ndim:
+        raise ValueError(
+            "`indices` and `arr` must have the same number of dimensions")
+    shape_ones = (1,) * indices.ndim
+    dest_dims = list(range(axis)) + [None] + list(range(axis+1, indices.ndim))
+
+    # build a fancy index, consisting of orthogonal aranges, with the
+    # requested index inserted at the right location
+    fancy_index = []
+    for dim, n in zip(dest_dims, arr_shape):
+        if dim is None:
+            fancy_index.append(indices)
+        else:
+            ind_shape = shape_ones[:dim] + (-1,) + shape_ones[dim+1:]
+            fancy_index.append(_nx.arange(n).reshape(ind_shape))
+
+    return tuple(fancy_index)
+
+
+def take_along_axis(arr, indices, axis):
+    """
+    Take values from the input array by matching 1d index and data slices.
+
+    This iterates over matching 1d slices oriented along the specified axis in
+    the index and data arrays, and uses the former to look up values in the
+    latter. These slices can be different lengths.
+
+    Functions returning an index along an axis, like `argsort` and
+    `argpartition`, produce suitable indices for this function.
+
+    .. versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    arr: ndarray (Ni..., M, Nk...)
+        Source array
+    indices: ndarray (Ni..., J, Nk...)
+        Indices to take along each 1d slice of `arr`. This must match the
+        dimension of arr, but dimensions Ni and Nj only need to broadcast
+        against `arr`.
+    axis: int
+        The axis to take 1d slices along. If axis is None, the input array is
+        treated as if it had first been flattened to 1d, for consistency with
+        `sort` and `argsort`.
+
+    Returns
+    -------
+    out: ndarray (Ni..., J, Nk...)
+        The indexed result.
+
+    Notes
+    -----
+    This is equivalent to (but faster than) the following use of `ndindex` and
+    `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices::
+
+        Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:]
+        J = indices.shape[axis]  # Need not equal M
+        out = np.empty(Nk + (J,) + Nk)
+
+        for ii in ndindex(Ni):
+            for kk in ndindex(Nk):
+                a_1d       = a      [ii + s_[:,] + kk]
+                indices_1d = indices[ii + s_[:,] + kk]
+                out_1d     = out    [ii + s_[:,] + kk]
+                for j in range(J):
+                    out_1d[j] = a_1d[indices_1d[j]]
+
+    Equivalently, eliminating the inner loop, the last two lines would be::
+
+                out_1d[:] = a_1d[indices_1d]
+
+    See Also
+    --------
+    take : Take along an axis, using the same indices for every 1d slice
+    put_along_axis :
+        Put values into the destination array by matching 1d index and data slices
+
+    Examples
+    --------
+
+    For this sample array
+
+    >>> a = np.array([[10, 30, 20], [60, 40, 50]])
+
+    We can sort either by using sort directly, or argsort and this function
+
+    >>> np.sort(a, axis=1)
+    array([[10, 20, 30],
+           [40, 50, 60]])
+    >>> ai = np.argsort(a, axis=1); ai
+    array([[0, 2, 1],
+           [1, 2, 0]], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[10, 20, 30],
+           [40, 50, 60]])
+
+    The same works for max and min, if you expand the dimensions:
+
+    >>> np.expand_dims(np.max(a, axis=1), axis=1)
+    array([[30],
+           [60]])
+    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai
+    array([[1],
+           [0], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[30],
+           [60]])
+
+    If we want to get the max and min at the same time, we can stack the
+    indices first
+
+    >>> ai_min = np.expand_dims(np.argmin(a, axis=1), axis=1)
+    >>> ai_max = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai = np.concatenate([ai_min, ai_max], axis=axis)
+    >> ai
+    array([[0, 1],
+           [1, 0]], dtype=int64)
+    >>> np.take_along_axis(a, ai, axis=1)
+    array([[10, 30],
+           [40, 60]])
+    """
+    # normalize inputs
+    if axis is None:
+        arr = arr.flat
+        arr_shape = (len(arr),)  # flatiter has no .shape
+        axis = 0
+    else:
+        axis = normalize_axis_index(axis, arr.ndim)
+        arr_shape = arr.shape
+
+    # use the fancy index
+    return arr[_make_along_axis_idx(arr_shape, indices, axis)]
+
+
+def put_along_axis(arr, indices, values, axis):
+    """
+    Put values into the destination array by matching 1d index and data slices.
+
+    This iterates over matching 1d slices oriented along the specified axis in
+    the index and data arrays, and uses the former to place values into the
+    latter. These slices can be different lengths.
+
+    Functions returning an index along an axis, like `argsort` and
+    `argpartition`, produce suitable indices for this function.
+
+    .. versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    arr: ndarray (Ni..., M, Nk...)
+        Destination array.
+    indices: ndarray (Ni..., J, Nk...)
+        Indices to change along each 1d slice of `arr`. This must match the
+        dimension of arr, but dimensions in Ni and Nj may be 1 to broadcast
+        against `arr`.
+    values: array_like (Ni..., J, Nk...)
+        values to insert at those indices. Its shape and dimension are
+        broadcast to match that of `indices`.
+    axis: int
+        The axis to take 1d slices along. If axis is None, the destination
+        array is treated as if a flattened 1d view had been created of it.
+
+    Notes
+    -----
+    This is equivalent to (but faster than) the following use of `ndindex` and
+    `s_`, which sets each of ``ii`` and ``kk`` to a tuple of indices::
+
+        Ni, M, Nk = a.shape[:axis], a.shape[axis], a.shape[axis+1:]
+        J = indices.shape[axis]  # Need not equal M
+
+        for ii in ndindex(Ni):
+            for kk in ndindex(Nk):
+                a_1d       = a      [ii + s_[:,] + kk]
+                indices_1d = indices[ii + s_[:,] + kk]
+                values_1d  = values [ii + s_[:,] + kk]
+                for j in range(J):
+                    a_1d[indices_1d[j]] = values_1d[j]
+
+    Equivalently, eliminating the inner loop, the last two lines would be::
+
+                a_1d[indices_1d] = values_1d
+
+    See Also
+    --------
+    take_along_axis :
+        Take values from the input array by matching 1d index and data slices
+
+    Examples
+    --------
+
+    For this sample array
+
+    >>> a = np.array([[10, 30, 20], [60, 40, 50]])
+
+    We can replace the maximum values with:
+
+    >>> ai = np.expand_dims(np.argmax(a, axis=1), axis=1)
+    >>> ai
+    array([[1],
+           [0]], dtype=int64)
+    >>> np.put_along_axis(a, ai, 99, axis=1)
+    >>> a
+    array([[10, 99, 20],
+           [99, 40, 50]])
+
+    """
+    # normalize inputs
+    if axis is None:
+        arr = arr.flat
+        axis = 0
+        arr_shape = (len(arr),)  # flatiter has no .shape
+    else:
+        axis = normalize_axis_index(axis, arr.ndim)
+        arr_shape = arr.shape
+
+    # use the fancy index
+    arr[_make_along_axis_idx(arr_shape, indices, axis)] = values
+
+
 def apply_along_axis(func1d, axis, arr, *args, **kwargs):
     """
     Apply a function to 1-D slices along the given axis.
diff --git a/numpy/lib/tests/test__iotools.py b/numpy/lib/tests/test__iotools.py
index 5f6c29a4d..b4888f1bd 100644
--- a/numpy/lib/tests/test__iotools.py
+++ b/numpy/lib/tests/test__iotools.py
@@ -53,6 +53,11 @@ class TestLineSplitter(object):
         test = LineSplitter(',')(strg)
         assert_equal(test, ['1', '2', '3', '4', '', '5'])
 
+        # gh-11028 bytes comment/delimiters should get encoded
+        strg = b" 1,2,3,4,,5 % test"
+        test = LineSplitter(delimiter=b',', comments=b'%')(strg)
+        assert_equal(test, ['1', '2', '3', '4', '', '5'])
+
     def test_constant_fixed_width(self):
         "Test LineSplitter w/ fixed-width fields"
         strg = "  1  2  3  4     5   # test"
diff --git a/numpy/lib/tests/test_arraysetops.py b/numpy/lib/tests/test_arraysetops.py
index 76c36c53e..dace5ade8 100644
--- a/numpy/lib/tests/test_arraysetops.py
+++ b/numpy/lib/tests/test_arraysetops.py
@@ -32,7 +32,46 @@ class TestSetOps(object):
         assert_array_equal(c, ed)
 
         assert_array_equal([], intersect1d([], []))
-
+        
+    def test_intersect1d_indices(self):
+        # unique inputs
+        a = np.array([1, 2, 3, 4]) 
+        b = np.array([2, 1, 4, 6])
+        c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True)
+        ee = np.array([1, 2, 4])
+        assert_array_equal(c, ee)
+        assert_array_equal(a[i1], ee)
+        assert_array_equal(b[i2], ee)
+        
+        # non-unique inputs
+        a = np.array([1, 2, 2, 3, 4, 3, 2])
+        b = np.array([1, 8, 4, 2, 2, 3, 2, 3])
+        c, i1, i2 = intersect1d(a, b, return_indices=True)
+        ef = np.array([1, 2, 3, 4])
+        assert_array_equal(c, ef)
+        assert_array_equal(a[i1], ef)
+        assert_array_equal(b[i2], ef)
+                
+        # non1d, unique inputs
+        a = np.array([[2, 4, 5, 6], [7, 8, 1, 15]])
+        b = np.array([[3, 2, 7, 6], [10, 12, 8, 9]])
+        c, i1, i2 = intersect1d(a, b, assume_unique=True, return_indices=True)
+        ui1 = np.unravel_index(i1, a.shape)
+        ui2 = np.unravel_index(i2, b.shape)
+        ea = np.array([2, 6, 7, 8])
+        assert_array_equal(ea, a[ui1])
+        assert_array_equal(ea, b[ui2])
+    
+        # non1d, not assumed to be uniqueinputs
+        a = np.array([[2, 4, 5, 6, 6], [4, 7, 8, 7, 2]])
+        b = np.array([[3, 2, 7, 7], [10, 12, 8, 7]])
+        c, i1, i2 = intersect1d(a, b, return_indices=True)
+        ui1 = np.unravel_index(i1, a.shape)
+        ui2 = np.unravel_index(i2, b.shape)
+        ea = np.array([2, 7, 8])
+        assert_array_equal(ea, a[ui1])
+        assert_array_equal(ea, b[ui2])
+        
     def test_setxor1d(self):
         a = np.array([5, 7, 1, 2])
         b = np.array([2, 4, 3, 1, 5])
@@ -74,8 +113,6 @@ class TestSetOps(object):
         assert_array_equal([1,7,8], ediff1d(two_elem, to_end=[7,8]))
         assert_array_equal([7,1], ediff1d(two_elem, to_begin=7))
         assert_array_equal([5,6,1], ediff1d(two_elem, to_begin=[5,6]))
-        assert(isinstance(ediff1d(np.matrix(1)), np.matrix))
-        assert(isinstance(ediff1d(np.matrix(1), to_begin=1), np.matrix))
 
     def test_isin(self):
         # the tests for in1d cover most of isin's behavior
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 5dc96775b..4103a9eb3 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -287,9 +287,6 @@ class TestAverage(object):
         assert_almost_equal(y5.mean(0), average(y5, 0))
         assert_almost_equal(y5.mean(1), average(y5, 1))
 
-        y6 = np.matrix(rand(5, 5))
-        assert_array_equal(y6.mean(0), average(y6, 0))
-
     def test_weights(self):
         y = np.arange(10)
         w = np.arange(10)
@@ -357,14 +354,6 @@ class TestAverage(object):
         assert_equal(type(np.average(a)), subclass)
         assert_equal(type(np.average(a, weights=w)), subclass)
 
-        # also test matrices
-        a = np.matrix([[1,2],[3,4]])
-        w = np.matrix([[1,2],[3,4]])
-
-        r = np.average(a, axis=0, weights=w)
-        assert_equal(type(r), np.matrix)
-        assert_equal(r, [[2.5, 10.0/3]])
-
     def test_upcasting(self):
         types = [('i4', 'i4', 'f8'), ('i4', 'f4', 'f8'), ('f4', 'i4', 'f8'),
                  ('f4', 'f4', 'f4'), ('f4', 'f8', 'f8')]
@@ -1623,16 +1612,6 @@ class TestTrapz(object):
         xm = np.ma.array(x, mask=mask)
         assert_almost_equal(trapz(y, xm), r)
 
-    def test_matrix(self):
-        # Test to make sure matrices give the same answer as ndarrays
-        x = np.linspace(0, 5)
-        y = x * x
-        r = trapz(y, x)
-        mx = np.matrix(x)
-        my = np.matrix(y)
-        mr = trapz(my, mx)
-        assert_almost_equal(mr, r)
-
 
 class TestSinc(object):
 
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index 597b5b376..e16ae12c2 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -613,8 +613,6 @@ class TestHistogramdd(object):
         assert_raises(ValueError, np.histogramdd, x, bins=[-1, 2, 4, 5])
         assert_raises(ValueError, np.histogramdd, x, bins=[1, 0.99, 1, 1])
         assert_raises(
-            ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 2, 3]])
-        assert_raises(
             ValueError, np.histogramdd, x, bins=[1, 1, 1, [1, 2, 3, -3]])
         assert_(np.histogramdd(x, bins=[1, 1, 1, [1, 2, 3, 4]]))
 
@@ -646,7 +644,7 @@ class TestHistogramdd(object):
         bins = [[0., 0.5, 1.0]]
         hist, _ = histogramdd(x, bins=bins)
         assert_(hist[0] == 0.0)
-        assert_(hist[1] == 1.)
+        assert_(hist[1] == 0.0)
         x = [1.0001]
         bins = [[0., 0.5, 1.0]]
         hist, _ = histogramdd(x, bins=bins)
@@ -660,3 +658,40 @@ class TestHistogramdd(object):
                       range=[[0.0, 1.0], [0.25, 0.75], [0.25, np.inf]])
         assert_raises(ValueError, histogramdd, vals,
                       range=[[0.0, 1.0], [np.nan, 0.75], [0.25, 0.5]])
+
+    def test_equal_edges(self):
+        """ Test that adjacent entries in an edge array can be equal """
+        x = np.array([0, 1, 2])
+        y = np.array([0, 1, 2])
+        x_edges = np.array([0, 2, 2])
+        y_edges = 1
+        hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+        hist_expected = np.array([
+            [2.],
+            [1.],  # x == 2 falls in the final bin
+        ])
+        assert_equal(hist, hist_expected)
+
+    def test_edge_dtype(self):
+        """ Test that if an edge array is input, its type is preserved """
+        x = np.array([0, 10, 20])
+        y = x / 10
+        x_edges = np.array([0, 5, 15, 20])
+        y_edges = x_edges / 10
+        hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+        assert_equal(edges[0].dtype, x_edges.dtype)
+        assert_equal(edges[1].dtype, y_edges.dtype)
+
+    def test_large_integers(self):
+        big = 2**60  # Too large to represent with a full precision float
+
+        x = np.array([0], np.int64)
+        x_edges = np.array([-1, +1], np.int64)
+        y = big + x
+        y_edges = big + x_edges
+
+        hist, edges = histogramdd((x, y), bins=(x_edges, y_edges))
+
+        assert_equal(hist[0, 0], 1)
diff --git a/numpy/lib/tests/test_index_tricks.py b/numpy/lib/tests/test_index_tricks.py
index f934e952a..315251daa 100644
--- a/numpy/lib/tests/test_index_tricks.py
+++ b/numpy/lib/tests/test_index_tricks.py
@@ -6,7 +6,7 @@ from numpy.testing import (
     assert_array_almost_equal, assert_raises, assert_raises_regex
     )
 from numpy.lib.index_tricks import (
-    mgrid, ndenumerate, fill_diagonal, diag_indices, diag_indices_from,
+    mgrid, ogrid, ndenumerate, fill_diagonal, diag_indices, diag_indices_from,
     index_exp, ndindex, r_, s_, ix_
     )
 
@@ -156,6 +156,15 @@ class TestGrid(object):
         assert_array_almost_equal(d[1, :, 1] - d[1, :, 0],
                                   0.2*np.ones(20, 'd'), 11)
 
+    def test_sparse(self):
+        grid_full   = mgrid[-1:1:10j, -2:2:10j]
+        grid_sparse = ogrid[-1:1:10j, -2:2:10j]
+
+        # sparse grids can be made dense by broadcasting
+        grid_broadcast = np.broadcast_arrays(*grid_sparse)
+        for f, b in zip(grid_full, grid_broadcast):
+            assert_equal(f, b)
+
 
 class TestConcatenator(object):
     def test_1d(self):
@@ -184,37 +193,6 @@ class TestConcatenator(object):
         assert_array_equal(d[:5, :], b)
         assert_array_equal(d[5:, :], c)
 
-    def test_matrix(self):
-        a = [1, 2]
-        b = [3, 4]
-
-        ab_r = np.r_['r', a, b]
-        ab_c = np.r_['c', a, b]
-
-        assert_equal(type(ab_r), np.matrix)
-        assert_equal(type(ab_c), np.matrix)
-
-        assert_equal(np.array(ab_r), [[1,2,3,4]])
-        assert_equal(np.array(ab_c), [[1],[2],[3],[4]])
-
-        assert_raises(ValueError, lambda: np.r_['rc', a, b])
-
-    def test_matrix_scalar(self):
-        r = np.r_['r', [1, 2], 3]
-        assert_equal(type(r), np.matrix)
-        assert_equal(np.array(r), [[1,2,3]])
-
-    def test_matrix_builder(self):
-        a = np.array([1])
-        b = np.array([2])
-        c = np.array([3])
-        d = np.array([4])
-        actual = np.r_['a, b; c, d']
-        expected = np.bmat([[a, b], [c, d]])
-
-        assert_equal(actual, expected)
-        assert_equal(type(actual), type(expected))
-
     def test_0d(self):
         assert_equal(r_[0, np.array(1), 2], [0, 1, 2])
         assert_equal(r_[[0, 1, 2], np.array(3)], [0, 1, 2, 3])
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index 42e221506..504372faf 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -113,42 +113,46 @@ class TestNanFunctions_MinMax(object):
         for f in self.nanfuncs:
             assert_(f(0.) == 0.)
 
-    def test_matrices(self):
+    def test_subclass(self):
+        class MyNDArray(np.ndarray):
+            pass
+
         # Check that it works and that type and
         # shape are preserved
-        mat = np.matrix(np.eye(3))
+        mine = np.eye(3).view(MyNDArray)
         for f in self.nanfuncs:
-            res = f(mat, axis=0)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (1, 3))
-            res = f(mat, axis=1)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (3, 1))
-            res = f(mat)
-            assert_(np.isscalar(res))
+            res = f(mine, axis=0)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine, axis=1)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine)
+            assert_(res.shape == ())
+
         # check that rows of nan are dealt with for subclasses (#4628)
-        mat[1] = np.nan
+        mine[1] = np.nan
         for f in self.nanfuncs:
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter('always')
-                res = f(mat, axis=0)
-                assert_(isinstance(res, np.matrix))
+                res = f(mine, axis=0)
+                assert_(isinstance(res, MyNDArray))
                 assert_(not np.any(np.isnan(res)))
                 assert_(len(w) == 0)
 
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter('always')
-                res = f(mat, axis=1)
-                assert_(isinstance(res, np.matrix))
-                assert_(np.isnan(res[1, 0]) and not np.isnan(res[0, 0])
-                        and not np.isnan(res[2, 0]))
+                res = f(mine, axis=1)
+                assert_(isinstance(res, MyNDArray))
+                assert_(np.isnan(res[1]) and not np.isnan(res[0])
+                        and not np.isnan(res[2]))
                 assert_(len(w) == 1, 'no warning raised')
                 assert_(issubclass(w[0].category, RuntimeWarning))
 
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter('always')
-                res = f(mat)
-                assert_(np.isscalar(res))
+                res = f(mine)
+                assert_(res.shape == ())
                 assert_(res != np.nan)
                 assert_(len(w) == 0)
 
@@ -209,19 +213,22 @@ class TestNanFunctions_ArgminArgmax(object):
         for f in self.nanfuncs:
             assert_(f(0.) == 0.)
 
-    def test_matrices(self):
+    def test_subclass(self):
+        class MyNDArray(np.ndarray):
+            pass
+
         # Check that it works and that type and
         # shape are preserved
-        mat = np.matrix(np.eye(3))
+        mine = np.eye(3).view(MyNDArray)
         for f in self.nanfuncs:
-            res = f(mat, axis=0)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (1, 3))
-            res = f(mat, axis=1)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (3, 1))
-            res = f(mat)
-            assert_(np.isscalar(res))
+            res = f(mine, axis=0)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine, axis=1)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == (3,))
+            res = f(mine)
+            assert_(res.shape == ())
 
 
 class TestNanFunctions_IntTypes(object):
@@ -381,19 +388,27 @@ class SharedNanFunctionsTestsMixin(object):
         for f in self.nanfuncs:
             assert_(f(0.) == 0.)
 
-    def test_matrices(self):
+    def test_subclass(self):
+        class MyNDArray(np.ndarray):
+            pass
+
         # Check that it works and that type and
         # shape are preserved
-        mat = np.matrix(np.eye(3))
+        array = np.eye(3)
+        mine = array.view(MyNDArray)
         for f in self.nanfuncs:
-            res = f(mat, axis=0)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (1, 3))
-            res = f(mat, axis=1)
-            assert_(isinstance(res, np.matrix))
-            assert_(res.shape == (3, 1))
-            res = f(mat)
-            assert_(np.isscalar(res))
+            expected_shape = f(array, axis=0).shape
+            res = f(mine, axis=0)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == expected_shape)
+            expected_shape = f(array, axis=1).shape
+            res = f(mine, axis=1)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == expected_shape)
+            expected_shape = f(array).shape
+            res = f(mine)
+            assert_(isinstance(res, MyNDArray))
+            assert_(res.shape == expected_shape)
 
 
 class TestNanFunctions_SumProd(SharedNanFunctionsTestsMixin):
@@ -481,18 +496,6 @@ class TestNanFunctions_CumSumProd(SharedNanFunctionsTestsMixin):
                 res = f(d, axis=axis)
                 assert_equal(res.shape, (3, 5, 7, 11))
 
-    def test_matrices(self):
-        # Check that it works and that type and
-        # shape are preserved
-        mat = np.matrix(np.eye(3))
-        for f in self.nanfuncs:
-            for axis in np.arange(2):
-                res = f(mat, axis=axis)
-                assert_(isinstance(res, np.matrix))
-                assert_(res.shape == (3, 3))
-            res = f(mat)
-            assert_(res.shape == (1, 3*3))
-
     def test_result_values(self):
         for axis in (-2, -1, 0, 1, None):
             tgt = np.cumprod(_ndat_ones, axis=axis)
diff --git a/numpy/lib/tests/test_shape_base.py b/numpy/lib/tests/test_shape_base.py
index 0e86fbb19..c95894f94 100644
--- a/numpy/lib/tests/test_shape_base.py
+++ b/numpy/lib/tests/test_shape_base.py
@@ -2,16 +2,106 @@ from __future__ import division, absolute_import, print_function
 
 import numpy as np
 import warnings
+import functools
 
 from numpy.lib.shape_base import (
     apply_along_axis, apply_over_axes, array_split, split, hsplit, dsplit,
-    vsplit, dstack, column_stack, kron, tile, expand_dims,
+    vsplit, dstack, column_stack, kron, tile, expand_dims, take_along_axis,
+    put_along_axis
     )
 from numpy.testing import (
     assert_, assert_equal, assert_array_equal, assert_raises, assert_warns
     )
 
 
+def _add_keepdims(func):
+    """ hack in keepdims behavior into a function taking an axis """
+    @functools.wraps(func)
+    def wrapped(a, axis, **kwargs):
+        res = func(a, axis=axis, **kwargs)
+        if axis is None:
+            axis = 0  # res is now a scalar, so we can insert this anywhere
+        return np.expand_dims(res, axis=axis)
+    return wrapped
+
+
+class TestTakeAlongAxis(object):
+    def test_argequivalent(self):
+        """ Test it translates from arg<func> to <func> """
+        from numpy.random import rand
+        a = rand(3, 4, 5)
+
+        funcs = [
+            (np.sort, np.argsort, dict()),
+            (_add_keepdims(np.min), _add_keepdims(np.argmin), dict()),
+            (_add_keepdims(np.max), _add_keepdims(np.argmax), dict()),
+            (np.partition, np.argpartition, dict(kth=2)),
+        ]
+
+        for func, argfunc, kwargs in funcs:
+            for axis in list(range(a.ndim)) + [None]:
+                a_func = func(a, axis=axis, **kwargs)
+                ai_func = argfunc(a, axis=axis, **kwargs)
+                assert_equal(a_func, take_along_axis(a, ai_func, axis=axis))
+
+    def test_invalid(self):
+        """ Test it errors when indices has too few dimensions """
+        a = np.ones((10, 10))
+        ai = np.ones((10, 2), dtype=np.intp)
+
+        # sanity check
+        take_along_axis(a, ai, axis=1)
+
+        # not enough indices
+        assert_raises(ValueError, take_along_axis, a, np.array(1), axis=1)
+        # bool arrays not allowed
+        assert_raises(IndexError, take_along_axis, a, ai.astype(bool), axis=1)
+        # float arrays not allowed
+        assert_raises(IndexError, take_along_axis, a, ai.astype(float), axis=1)
+        # invalid axis
+        assert_raises(np.AxisError, take_along_axis, a, ai, axis=10)
+
+    def test_empty(self):
+        """ Test everything is ok with empty results, even with inserted dims """
+        a  = np.ones((3, 4, 5))
+        ai = np.ones((3, 0, 5), dtype=np.intp)
+
+        actual = take_along_axis(a, ai, axis=1)
+        assert_equal(actual.shape, ai.shape)
+
+    def test_broadcast(self):
+        """ Test that non-indexing dimensions are broadcast in both directions """
+        a  = np.ones((3, 4, 1))
+        ai = np.ones((1, 2, 5), dtype=np.intp)
+        actual = take_along_axis(a, ai, axis=1)
+        assert_equal(actual.shape, (3, 2, 5))
+
+
+class TestPutAlongAxis(object):
+    def test_replace_max(self):
+        a_base = np.array([[10, 30, 20], [60, 40, 50]])
+
+        for axis in list(range(a_base.ndim)) + [None]:
+            # we mutate this in the loop
+            a = a_base.copy()
+
+            # replace the max with a small value
+            i_max = _add_keepdims(np.argmax)(a, axis=axis)
+            put_along_axis(a, i_max, -99, axis=axis)
+
+            # find the new minimum, which should max
+            i_min = _add_keepdims(np.argmin)(a, axis=axis)
+
+            assert_equal(i_min, i_max)
+
+    def test_broadcast(self):
+        """ Test that non-indexing dimensions are broadcast in both directions """
+        a  = np.ones((3, 4, 1))
+        ai = np.arange(10, dtype=np.intp).reshape((1, 2, 5)) % 4
+        put_along_axis(a, ai, 20, axis=1)
+        assert_equal(take_along_axis(a, ai, axis=1), 20)
+
+
 class TestApplyAlongAxis(object):
     def test_simple(self):
         a = np.ones((20, 10), 'd')
@@ -29,19 +119,21 @@ class TestApplyAlongAxis(object):
                            [[27, 30, 33], [36, 39, 42], [45, 48, 51]])
 
     def test_preserve_subclass(self):
-        # this test is particularly malicious because matrix
-        # refuses to become 1d
         def double(row):
             return row * 2
-        m = np.matrix([[0, 1], [2, 3]])
-        expected = np.matrix([[0, 2], [4, 6]])
+
+        class MyNDArray(np.ndarray):
+            pass
+
+        m = np.array([[0, 1], [2, 3]]).view(MyNDArray)
+        expected = np.array([[0, 2], [4, 6]]).view(MyNDArray)
 
         result = apply_along_axis(double, 0, m)
-        assert_(isinstance(result, np.matrix))
+        assert_(isinstance(result, MyNDArray))
         assert_array_equal(result, expected)
 
         result = apply_along_axis(double, 1, m)
-        assert_(isinstance(result, np.matrix))
+        assert_(isinstance(result, MyNDArray))
         assert_array_equal(result, expected)
 
     def test_subclass(self):
@@ -492,16 +584,10 @@ class TestSqueeze(object):
 
 class TestKron(object):
     def test_return_type(self):
-        a = np.ones([2, 2])
-        m = np.asmatrix(a)
-        assert_equal(type(kron(a, a)), np.ndarray)
-        assert_equal(type(kron(m, m)), np.matrix)
-        assert_equal(type(kron(a, m)), np.matrix)
-        assert_equal(type(kron(m, a)), np.matrix)
-
         class myarray(np.ndarray):
             __array_priority__ = 0.0
 
+        a = np.ones([2, 2])
         ma = myarray(a.shape, a.dtype, a.data)
         assert_equal(type(kron(a, a)), np.ndarray)
         assert_equal(type(kron(ma, ma)), myarray)
diff --git a/numpy/lib/twodim_base.py b/numpy/lib/twodim_base.py
index 402c18850..cca316e9a 100644
--- a/numpy/lib/twodim_base.py
+++ b/numpy/lib/twodim_base.py
@@ -650,7 +650,7 @@ def histogram2d(x, y, bins=10, range=None, normed=False, weights=None):
         N = 1
 
     if N != 1 and N != 2:
-        xedges = yedges = asarray(bins, float)
+        xedges = yedges = asarray(bins)
         bins = [xedges, yedges]
     hist, edges = histogramdd([x, y], bins, range, normed, weights)
     return hist, edges[0], edges[1]
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 5ee230f92..98af0733b 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -16,20 +16,20 @@ __all__ = ['matrix_power', 'solve', 'tensorsolve', 'tensorinv', 'inv',
            'svd', 'eig', 'eigh', 'lstsq', 'norm', 'qr', 'cond', 'matrix_rank',
            'LinAlgError', 'multi_dot']
 
+import operator
 import warnings
 
 from numpy.core import (
     array, asarray, zeros, empty, empty_like, intc, single, double,
-    csingle, cdouble, inexact, complexfloating, newaxis, ravel, all, Inf, dot,
-    add, multiply, sqrt, maximum, fastCopyAndTranspose, sum, isfinite, size,
-    finfo, errstate, geterrobj, longdouble, moveaxis, amin, amax, product, abs,
-    broadcast, atleast_2d, intp, asanyarray, object_, ones, matmul,
-    swapaxes, divide, count_nonzero, ndarray, isnan
+    csingle, cdouble, inexact, complexfloating, newaxis, all, Inf, dot,
+    add, multiply, sqrt, fastCopyAndTranspose, sum, isfinite,
+    finfo, errstate, geterrobj, moveaxis, amin, amax, product, abs,
+    atleast_2d, intp, asanyarray, object_, matmul,
+    swapaxes, divide, count_nonzero, isnan
 )
 from numpy.core.multiarray import normalize_axis_index
-from numpy.lib import triu, asfarray
+from numpy.lib.twodim_base import triu, eye
 from numpy.linalg import lapack_lite, _umath_linalg
-from numpy.matrixlib.defmatrix import matrix_power
 
 # For Python2/3 compatibility
 _N = b'N'
@@ -210,7 +210,8 @@ def _assertSquareness(*arrays):
 
 def _assertNdSquareness(*arrays):
     for a in arrays:
-        if max(a.shape[-2:]) != min(a.shape[-2:]):
+        m, n = a.shape[-2:]
+        if m != n:
             raise LinAlgError('Last 2 dimensions of the array must be square')
 
 def _assertFinite(*arrays):
@@ -532,6 +533,109 @@ def inv(a):
     return wrap(ainv.astype(result_t, copy=False))
 
 
+def matrix_power(a, n):
+    """
+    Raise a square matrix to the (integer) power `n`.
+
+    For positive integers `n`, the power is computed by repeated matrix
+    squarings and matrix multiplications. If ``n == 0``, the identity matrix
+    of the same shape as M is returned. If ``n < 0``, the inverse
+    is computed and then raised to the ``abs(n)``.
+
+    Parameters
+    ----------
+    a : (..., M, M) array_like
+        Matrix to be "powered."
+    n : int
+        The exponent can be any integer or long integer, positive,
+        negative, or zero.
+
+    Returns
+    -------
+    a**n : (..., M, M) ndarray or matrix object
+        The return value is the same shape and type as `M`;
+        if the exponent is positive or zero then the type of the
+        elements is the same as those of `M`. If the exponent is
+        negative the elements are floating-point.
+
+    Raises
+    ------
+    LinAlgError
+        For matrices that are not square or that (for negative powers) cannot
+        be inverted numerically.
+
+    Examples
+    --------
+    >>> from numpy.linalg import matrix_power
+    >>> i = np.array([[0, 1], [-1, 0]]) # matrix equiv. of the imaginary unit
+    >>> matrix_power(i, 3) # should = -i
+    array([[ 0, -1],
+           [ 1,  0]])
+    >>> matrix_power(i, 0)
+    array([[1, 0],
+           [0, 1]])
+    >>> matrix_power(i, -3) # should = 1/(-i) = i, but w/ f.p. elements
+    array([[ 0.,  1.],
+           [-1.,  0.]])
+
+    Somewhat more sophisticated example
+
+    >>> q = np.zeros((4, 4))
+    >>> q[0:2, 0:2] = -i
+    >>> q[2:4, 2:4] = i
+    >>> q # one of the three quaternion units not equal to 1
+    array([[ 0., -1.,  0.,  0.],
+           [ 1.,  0.,  0.,  0.],
+           [ 0.,  0.,  0.,  1.],
+           [ 0.,  0., -1.,  0.]])
+    >>> matrix_power(q, 2) # = -np.eye(4)
+    array([[-1.,  0.,  0.,  0.],
+           [ 0., -1.,  0.,  0.],
+           [ 0.,  0., -1.,  0.],
+           [ 0.,  0.,  0., -1.]])
+
+    """
+    a = asanyarray(a)
+    _assertRankAtLeast2(a)
+    _assertNdSquareness(a)
+
+    try:
+        n = operator.index(n)
+    except TypeError:
+        raise TypeError("exponent must be an integer")
+
+    if n == 0:
+        a = empty_like(a)
+        a[...] = eye(a.shape[-2], dtype=a.dtype)
+        return a
+
+    elif n < 0:
+        a = inv(a)
+        n = abs(n)
+
+    # short-cuts.
+    if n == 1:
+        return a
+
+    elif n == 2:
+        return matmul(a, a)
+
+    elif n == 3:
+        return matmul(matmul(a, a), a)
+
+    # Use binary decomposition to reduce the number of matrix multiplications.
+    # Here, we iterate over the bits of n, from LSB to MSB, raise `a` to
+    # increasing powers of 2, and multiply into the result as needed.
+    z = result = None
+    while n > 0:
+        z = a if z is None else matmul(z, z)
+        n, bit = divmod(n, 2)
+        if bit:
+            result = z if result is None else matmul(result, z)
+
+    return result
+
+
 # Cholesky decomposition
 
 def cholesky(a):
@@ -1429,8 +1533,7 @@ def svd(a, full_matrices=True, compute_uv=True):
 
     extobj = get_linalg_error_extobj(_raise_linalgerror_svd_nonconvergence)
 
-    m = a.shape[-2]
-    n = a.shape[-1]
+    m, n = a.shape[-2:]
     if compute_uv:
         if full_matrices:
             if m < n:
@@ -1750,7 +1853,8 @@ def pinv(a, rcond=1e-15 ):
     a, wrap = _makearray(a)
     rcond = asarray(rcond)
     if _isEmpty2d(a):
-        res = empty(a.shape[:-2] + (a.shape[-1], a.shape[-2]), dtype=a.dtype)
+        m, n = a.shape[-2:]
+        res = empty(a.shape[:-2] + (n, m), dtype=a.dtype)
         return wrap(res)
     a = a.conjugate()
     u, s, vt = svd(a, full_matrices=False)
@@ -2007,10 +2111,9 @@ def lstsq(a, b, rcond="warn"):
         b = b[:, newaxis]
     _assertRank2(a, b)
     _assertNoEmpty2d(a, b)  # TODO: relax this constraint
-    m  = a.shape[0]
-    n  = a.shape[1]
-    n_rhs = b.shape[1]
-    if m != b.shape[0]:
+    m, n = a.shape[-2:]
+    m2, n_rhs = b.shape[-2:]
+    if m != m2:
         raise LinAlgError('Incompatible dimensions')
 
     t, result_t = _commonType(a, b)
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index b3dd2e4ae..87dfe988a 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -7,11 +7,13 @@ import os
 import sys
 import itertools
 import traceback
+import textwrap
+import subprocess
 import pytest
 
 import numpy as np
-from numpy import array, single, double, csingle, cdouble, dot, identity
-from numpy import multiply, atleast_2d, inf, asarray
+from numpy import array, single, double, csingle, cdouble, dot, identity, matmul
+from numpy import multiply, atleast_2d, inf, asarray, matrix
 from numpy import linalg
 from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError
 from numpy.linalg.linalg import _multi_dot_matrix_chain_order
@@ -445,8 +447,7 @@ def identity_like_generalized(a):
     a = asarray(a)
     if a.ndim >= 3:
         r = np.empty(a.shape, dtype=a.dtype)
-        for c in itertools.product(*map(range, a.shape[:-2])):
-            r[c] = identity(a.shape[-2])
+        r[...] = identity(a.shape[-2])
         return r
     else:
         return identity(a.shape[0])
@@ -927,16 +928,21 @@ class TestMatrixPower(object):
     R90 = array([[0, 1], [-1, 0]])
     Arb22 = array([[4, -7], [-2, 10]])
     noninv = array([[1, 0], [0, 0]])
-    arbfloat = array([[0.1, 3.2], [1.2, 0.7]])
+    arbfloat = array([[[0.1, 3.2], [1.2, 0.7]],
+                      [[0.2, 6.4], [2.4, 1.4]]])
 
     large = identity(10)
     t = large[1, :].copy()
-    large[1, :] = large[0,:]
+    large[1, :] = large[0, :]
     large[0, :] = t
 
     def test_large_power(self):
         assert_equal(
             matrix_power(self.R90, 2 ** 100 + 2 ** 10 + 2 ** 5 + 1), self.R90)
+        assert_equal(
+            matrix_power(self.R90, 2 ** 100 + 2 ** 10 + 1), self.R90)
+        assert_equal(
+            matrix_power(self.R90, 2 ** 100 + 2 + 1), -self.R90)
 
     def test_large_power_trailing_zero(self):
         assert_equal(
@@ -945,7 +951,7 @@ class TestMatrixPower(object):
     def testip_zero(self):
         def tz(M):
             mz = matrix_power(M, 0)
-            assert_equal(mz, identity(M.shape[0]))
+            assert_equal(mz, identity_like_generalized(M))
             assert_equal(mz.dtype, M.dtype)
         for M in [self.Arb22, self.arbfloat, self.large]:
             tz(M)
@@ -961,7 +967,7 @@ class TestMatrixPower(object):
     def testip_two(self):
         def tz(M):
             mz = matrix_power(M, 2)
-            assert_equal(mz, dot(M, M))
+            assert_equal(mz, matmul(M, M))
             assert_equal(mz.dtype, M.dtype)
         for M in [self.Arb22, self.arbfloat, self.large]:
             tz(M)
@@ -969,14 +975,19 @@ class TestMatrixPower(object):
     def testip_invert(self):
         def tz(M):
             mz = matrix_power(M, -1)
-            assert_almost_equal(identity(M.shape[0]), dot(mz, M))
+            assert_almost_equal(matmul(mz, M), identity_like_generalized(M))
         for M in [self.R90, self.Arb22, self.arbfloat, self.large]:
             tz(M)
 
     def test_invert_noninvertible(self):
-        import numpy.linalg
-        assert_raises(numpy.linalg.linalg.LinAlgError,
-                      lambda: matrix_power(self.noninv, -1))
+        assert_raises(LinAlgError, matrix_power, self.noninv, -1)
+
+    def test_invalid(self):
+        assert_raises(TypeError, matrix_power, self.R90, 1.5)
+        assert_raises(TypeError, matrix_power, self.R90, [1])
+        assert_raises(LinAlgError, matrix_power, np.array([1]), 1)
+        assert_raises(LinAlgError, matrix_power, np.array([[1], [2]]), 1)
+        assert_raises(LinAlgError, matrix_power, np.ones((4, 3, 2)), 1)
 
 
 class TestBoolPower(object):
@@ -1752,6 +1763,40 @@ def test_xerbla_override():
             raise SkipTest('Numpy xerbla not linked in.')
 
 
+def test_sdot_bug_8577():
+    # Regression test that loading certain other libraries does not
+    # result to wrong results in float32 linear algebra.
+    #
+    # There's a bug gh-8577 on OSX that can trigger this, and perhaps
+    # there are also other situations in which it occurs.
+    #
+    # Do the check in a separate process.
+
+    bad_libs = ['PyQt5.QtWidgets', 'IPython']
+
+    template = textwrap.dedent("""
+    import sys
+    {before}
+    try:
+        import {bad_lib}
+    except ImportError:
+        sys.exit(0)
+    {after}
+    x = np.ones(2, dtype=np.float32)
+    sys.exit(0 if np.allclose(x.dot(x), 2.0) else 1)
+    """)
+
+    for bad_lib in bad_libs:
+        code = template.format(before="import numpy as np", after="",
+                               bad_lib=bad_lib)
+        subprocess.check_call([sys.executable, "-c", code])
+
+        # Swapped import order
+        code = template.format(after="import numpy as np", before="",
+                               bad_lib=bad_lib)
+        subprocess.check_call([sys.executable, "-c", code])
+
+
 class TestMultiDot(object):
 
     def test_basic_function_with_three_arguments(self):
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 3a5ad7250..7dc1cb0cb 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -3202,7 +3202,7 @@ static void
                     for (i = 0; i < nrhs; i++) {
                         @ftyp@ *vector = components + i*m;
                         /* Numpy and fortran floating types are the same size,
-                         * so this case is safe */
+                         * so this cast is safe */
                         @basetyp@ abs2 = @TYPE@_abs2((@typ@ *)vector, excess);
                         memcpy(
                             resid + i*r_out.column_strides,
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index fb28fa8e5..5ed086db3 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -2799,13 +2799,8 @@ class MaskedArray(ndarray):
             # FIXME _sharedmask is never used.
             _sharedmask = True
         # Process mask.
-        # Number of named fields (or zero if none)
-        names_ = _data.dtype.names or ()
         # Type of the mask
-        if names_:
-            mdtype = make_mask_descr(_data.dtype)
-        else:
-            mdtype = MaskType
+        mdtype = make_mask_descr(_data.dtype)
 
         if mask is nomask:
             # Case 1. : no mask in input.
@@ -2831,14 +2826,12 @@ class MaskedArray(ndarray):
                     _data._mask = mask
                     _data._sharedmask = False
             else:
+                _data._sharedmask = not copy
                 if copy:
                     _data._mask = _data._mask.copy()
-                    _data._sharedmask = False
                     # Reset the shape of the original mask
                     if getmask(data) is not nomask:
                         data._mask.shape = data.shape
-                else:
-                    _data._sharedmask = True
         else:
             # Case 2. : With a mask in input.
             # If mask is boolean, create an array of True or False
@@ -2875,7 +2868,7 @@ class MaskedArray(ndarray):
                     _data._mask = mask
                     _data._sharedmask = not copy
                 else:
-                    if names_:
+                    if _data.dtype.names:
                         def _recursive_or(a, b):
                             "do a|=b on each field of a, recursively"
                             for name in a.dtype.names:
@@ -2884,7 +2877,7 @@ class MaskedArray(ndarray):
                                     _recursive_or(af, bf)
                                 else:
                                     af |= bf
-                            return
+
                         _recursive_or(_data._mask, mask)
                     else:
                         _data._mask = np.logical_or(mask, _data._mask)
@@ -2999,7 +2992,9 @@ class MaskedArray(ndarray):
                     order = "K"
 
                 _mask = _mask.astype(_mask_dtype, order)
-
+            else:
+                # Take a view so shape changes, etc., do not propagate back.
+                _mask = _mask.view()
         else:
             _mask = nomask
 
@@ -3344,17 +3339,35 @@ class MaskedArray(ndarray):
             _mask[indx] = mindx
         return
 
-    def __setattr__(self, attr, value):
-        super(MaskedArray, self).__setattr__(attr, value)
-        if attr == 'dtype' and self._mask is not nomask:
-            self._mask = self._mask.view(make_mask_descr(value), ndarray)
-            # Try to reset the shape of the mask (if we don't have a void)
-            # This raises a ValueError if the dtype change won't work
+    # Define so that we can overwrite the setter.
+    @property
+    def dtype(self):
+        return super(MaskedArray, self).dtype
+
+    @dtype.setter
+    def dtype(self, dtype):
+        super(MaskedArray, type(self)).dtype.__set__(self, dtype)
+        if self._mask is not nomask:
+            self._mask = self._mask.view(make_mask_descr(dtype), ndarray)
+            # Try to reset the shape of the mask (if we don't have a void).
+            # This raises a ValueError if the dtype change won't work.
             try:
                 self._mask.shape = self.shape
             except (AttributeError, TypeError):
                 pass
 
+    @property
+    def shape(self):
+        return super(MaskedArray, self).shape
+
+    @shape.setter
+    def shape(self, shape):
+        super(MaskedArray, type(self)).shape.__set__(self, shape)
+        # Cannot use self._mask, since it may not (yet) exist when a
+        # masked matrix sets the shape.
+        if getmask(self) is not nomask:
+            self._mask.shape = self.shape
+
     def __setmask__(self, mask, copy=False):
         """
         Set the mask.
@@ -5531,15 +5544,7 @@ class MaskedArray(ndarray):
         sidx = self.argsort(axis=axis, kind=kind, order=order,
                             fill_value=fill_value, endwith=endwith)
 
-        # save memory for 1d arrays
-        if self.ndim == 1:
-            idx = sidx
-        else:
-            idx = list(np.ix_(*[np.arange(x) for x in self.shape]))
-            idx[axis] = sidx
-            idx = tuple(idx)
-
-        self[...] = self[idx]
+        self[...] = np.take_along_axis(self, sidx, axis=axis)
 
     def min(self, axis=None, out=None, fill_value=None, keepdims=np._NoValue):
         """
@@ -6317,6 +6322,12 @@ class MaskedConstant(MaskedArray):
         # precedent for this with `np.bool_` scalars.
         return self
 
+    def __copy__(self):
+        return self
+		
+    def __deepcopy__(self, memo):
+        return self
+
     def __setattr__(self, attr, value):
         if not self.__has_singleton():
             # allow the singleton to be initialized
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
index da35217d1..3be4d3625 100644
--- a/numpy/ma/extras.py
+++ b/numpy/ma/extras.py
@@ -747,19 +747,17 @@ def _median(a, axis=None, out=None, overwrite_input=False):
             return np.ma.minimum_fill_value(asorted)
         return s
 
-    counts = count(asorted, axis=axis)
+    counts = count(asorted, axis=axis, keepdims=True)
     h = counts // 2
 
-    # create indexing mesh grid for all but reduced axis
-    axes_grid = [np.arange(x) for i, x in enumerate(asorted.shape)
-                 if i != axis]
-    ind = np.meshgrid(*axes_grid, sparse=True, indexing='ij')
+    # duplicate high if odd number of elements so mean does nothing
+    odd = counts % 2 == 1
+    l = np.where(odd, h, h-1)
 
-    # insert indices of low and high median
-    ind.insert(axis, h - 1)
-    low = asorted[tuple(ind)]
-    ind[axis] = np.minimum(h, asorted.shape[axis] - 1)
-    high = asorted[tuple(ind)]
+    lh = np.concatenate([l,h], axis=axis)
+
+    # get low and high median
+    low_high = np.take_along_axis(asorted, lh, axis=axis)
 
     def replace_masked(s):
         # Replace masked entries with minimum_full_value unless it all values
@@ -767,30 +765,20 @@ def _median(a, axis=None, out=None, overwrite_input=False):
         # larger than the fill value is undefined and a valid value placed
         # elsewhere, e.g. [4, --, inf].
         if np.ma.is_masked(s):
-            rep = (~np.all(asorted.mask, axis=axis)) & s.mask
+            rep = (~np.all(asorted.mask, axis=axis, keepdims=True)) & s.mask
             s.data[rep] = np.ma.minimum_fill_value(asorted)
             s.mask[rep] = False
 
-    replace_masked(low)
-    replace_masked(high)
-
-    # duplicate high if odd number of elements so mean does nothing
-    odd = counts % 2 == 1
-    np.copyto(low, high, where=odd)
-    # not necessary for scalar True/False masks
-    try:
-        np.copyto(low.mask, high.mask, where=odd)
-    except Exception:
-        pass
+    replace_masked(low_high)
 
     if np.issubdtype(asorted.dtype, np.inexact):
         # avoid inf / x = masked
-        s = np.ma.sum([low, high], axis=0, out=out)
+        s = np.ma.sum(low_high, axis=axis, out=out)
         np.true_divide(s.data, 2., casting='unsafe', out=s.data)
 
         s = np.lib.utils._median_nancheck(asorted, s, axis, out)
     else:
-        s = np.ma.mean([low, high], axis=0, out=out)
+        s = np.ma.mean(low_high, axis=axis, out=out)
 
     return s
 
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index 63703f6cd..51616f214 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -352,9 +352,11 @@ class TestMaskedArray(object):
         assert_equal(y1._mask.__array_interface__, m.__array_interface__)
 
         y1a = array(y1)
+        # Default for masked array is not to copy; see gh-10318.
         assert_(y1a._data.__array_interface__ ==
                         y1._data.__array_interface__)
-        assert_(y1a.mask is y1.mask)
+        assert_(y1a._mask.__array_interface__ ==
+                        y1._mask.__array_interface__)
 
         y2 = array(x1, mask=m3)
         assert_(y2._data.__array_interface__ == x1.__array_interface__)
@@ -4826,6 +4828,16 @@ class TestMaskedConstant(object):
             np.ma.masked.copy() is np.ma.masked,
             np.True_.copy() is np.True_)
 
+    def test__copy(self):
+        import copy
+        assert_(
+            copy.copy(np.ma.masked) is np.ma.masked)
+
+    def test_deepcopy(self):
+        import copy
+        assert_(
+            copy.deepcopy(np.ma.masked) is np.ma.masked)
+
     def test_immutable(self):
         orig = np.ma.masked
         assert_raises(np.ma.core.MaskError, operator.setitem, orig, (), 1)
diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py
index 2d5e30b2c..c29bec2bd 100644
--- a/numpy/ma/tests/test_extras.py
+++ b/numpy/ma/tests/test_extras.py
@@ -307,21 +307,6 @@ class TestConcatenator(object):
         assert_array_equal(d[5:,:], b_2)
         assert_array_equal(d.mask, np.r_[m_1, m_2])
 
-    def test_matrix_builder(self):
-        assert_raises(np.ma.MAError, lambda: mr_['1, 2; 3, 4'])
-
-    def test_matrix(self):
-        # Test consistency with unmasked version.  If we ever deprecate
-        # matrix, this test should either still pass, or both actual and
-        # expected should fail to be build.
-        actual = mr_['r', 1, 2, 3]
-        expected = np.ma.array(np.r_['r', 1, 2, 3])
-        assert_array_equal(actual, expected)
-
-        # outer type is masked array, inner type is matrix
-        assert_equal(type(actual), type(expected))
-        assert_equal(type(actual.data), type(expected.data))
-
     def test_masked_constant(self):
         actual = mr_[np.ma.masked, 1]
         assert_equal(actual.mask, [True, False])
diff --git a/numpy/ma/tests/test_old_ma.py b/numpy/ma/tests/test_old_ma.py
index 70eab0edc..d7b1e3c18 100644
--- a/numpy/ma/tests/test_old_ma.py
+++ b/numpy/ma/tests/test_old_ma.py
@@ -273,7 +273,11 @@ class TestMa(object):
         assert_(y1.mask is m)
 
         y1a = array(y1, copy=0)
-        assert_(y1a.mask is y1.mask)
+        # For copy=False, one might expect that the array would just
+        # passed on, i.e., that it would be "is" instead of "==".
+        # See gh-4043 for discussion.
+        assert_(y1a._mask.__array_interface__ ==
+                y1._mask.__array_interface__)
 
         y2 = array(x1, mask=m3, copy=0)
         assert_(y2.mask is m3)
diff --git a/numpy/ma/tests/test_regression.py b/numpy/ma/tests/test_regression.py
index 04e10d9d1..96c418a51 100644
--- a/numpy/ma/tests/test_regression.py
+++ b/numpy/ma/tests/test_regression.py
@@ -74,3 +74,13 @@ class TestRegression(object):
             r1 = np.ma.corrcoef(x, y, ddof=1)
             # ddof should not have an effect (it gets cancelled out)
             assert_allclose(r0.data, r1.data)
+
+    def test_mask_not_backmangled(self):
+        # See gh-10314.  Test case taken from gh-3140.
+        a = np.ma.MaskedArray([1., 2.], mask=[False, False])
+        assert_(a.mask.shape == (2,))
+        b = np.tile(a, (2, 1))
+        # Check that the above no longer changes a.shape to (1, 2)
+        assert_(a.mask.shape == (2,))
+        assert_(b.shape == (2, 2))
+        assert_(b.mask.shape == (2, 2))
diff --git a/numpy/matrixlib/defmatrix.py b/numpy/matrixlib/defmatrix.py
index 1f5c94921..7baa401a8 100644
--- a/numpy/matrixlib/defmatrix.py
+++ b/numpy/matrixlib/defmatrix.py
@@ -3,10 +3,14 @@ from __future__ import division, absolute_import, print_function
 __all__ = ['matrix', 'bmat', 'mat', 'asmatrix']
 
 import sys
+import warnings
 import ast
 import numpy.core.numeric as N
-from numpy.core.numeric import concatenate, isscalar, binary_repr, identity, asanyarray
-from numpy.core.numerictypes import issubdtype
+from numpy.core.numeric import concatenate, isscalar
+# While not in __all__, matrix_power used to be defined here, so we import
+# it for backward compatibility.
+from numpy.linalg import matrix_power
+
 
 def _convert_from_string(data):
     for char in '[]':
@@ -63,118 +67,14 @@ def asmatrix(data, dtype=None):
     """
     return matrix(data, dtype=dtype, copy=False)
 
-def matrix_power(M, n):
-    """
-    Raise a square matrix to the (integer) power `n`.
-
-    For positive integers `n`, the power is computed by repeated matrix
-    squarings and matrix multiplications. If ``n == 0``, the identity matrix
-    of the same shape as M is returned. If ``n < 0``, the inverse
-    is computed and then raised to the ``abs(n)``.
-
-    Parameters
-    ----------
-    M : ndarray or matrix object
-        Matrix to be "powered."  Must be square, i.e. ``M.shape == (m, m)``,
-        with `m` a positive integer.
-    n : int
-        The exponent can be any integer or long integer, positive,
-        negative, or zero.
-
-    Returns
-    -------
-    M**n : ndarray or matrix object
-        The return value is the same shape and type as `M`;
-        if the exponent is positive or zero then the type of the
-        elements is the same as those of `M`. If the exponent is
-        negative the elements are floating-point.
-
-    Raises
-    ------
-    LinAlgError
-        If the matrix is not numerically invertible.
-
-    See Also
-    --------
-    matrix
-        Provides an equivalent function as the exponentiation operator
-        (``**``, not ``^``).
-
-    Examples
-    --------
-    >>> from numpy import linalg as LA
-    >>> i = np.array([[0, 1], [-1, 0]]) # matrix equiv. of the imaginary unit
-    >>> LA.matrix_power(i, 3) # should = -i
-    array([[ 0, -1],
-           [ 1,  0]])
-    >>> LA.matrix_power(np.matrix(i), 3) # matrix arg returns matrix
-    matrix([[ 0, -1],
-            [ 1,  0]])
-    >>> LA.matrix_power(i, 0)
-    array([[1, 0],
-           [0, 1]])
-    >>> LA.matrix_power(i, -3) # should = 1/(-i) = i, but w/ f.p. elements
-    array([[ 0.,  1.],
-           [-1.,  0.]])
-
-    Somewhat more sophisticated example
-
-    >>> q = np.zeros((4, 4))
-    >>> q[0:2, 0:2] = -i
-    >>> q[2:4, 2:4] = i
-    >>> q # one of the three quaternion units not equal to 1
-    array([[ 0., -1.,  0.,  0.],
-           [ 1.,  0.,  0.,  0.],
-           [ 0.,  0.,  0.,  1.],
-           [ 0.,  0., -1.,  0.]])
-    >>> LA.matrix_power(q, 2) # = -np.eye(4)
-    array([[-1.,  0.,  0.,  0.],
-           [ 0., -1.,  0.,  0.],
-           [ 0.,  0., -1.,  0.],
-           [ 0.,  0.,  0., -1.]])
-
-    """
-    M = asanyarray(M)
-    if M.ndim != 2 or M.shape[0] != M.shape[1]:
-        raise ValueError("input must be a square array")
-    if not issubdtype(type(n), N.integer):
-        raise TypeError("exponent must be an integer")
-
-    from numpy.linalg import inv
-
-    if n==0:
-        M = M.copy()
-        M[:] = identity(M.shape[0])
-        return M
-    elif n<0:
-        M = inv(M)
-        n *= -1
-
-    result = M
-    if n <= 3:
-        for _ in range(n-1):
-            result=N.dot(result, M)
-        return result
-
-    # binary decomposition to reduce the number of Matrix
-    # multiplications for n > 3.
-    beta = binary_repr(n)
-    Z, q, t = M, 0, len(beta)
-    while beta[t-q-1] == '0':
-        Z = N.dot(Z, Z)
-        q += 1
-    result = Z
-    for k in range(q+1, t):
-        Z = N.dot(Z, Z)
-        if beta[t-k-1] == '1':
-            result = N.dot(result, Z)
-    return result
-
-
 class matrix(N.ndarray):
     """
     matrix(data, dtype=None, copy=True)
 
+    .. note:: It is no longer recommended to use this class, even for linear
+              algebra. Instead use regular arrays. The class may be removed
+              in the future.
+
     Returns a matrix from an array-like object, or from a string of data.
     A matrix is a specialized 2-D array that retains its 2-D nature
     through operations.  It has certain special operators, such as ``*``
@@ -210,6 +110,12 @@ class matrix(N.ndarray):
     """
     __array_priority__ = 10.0
     def __new__(subtype, data, dtype=None, copy=True):
+        warnings.warn('the matrix subclass is not the recommended way to '
+                      'represent matrices or deal with linear algebra (see '
+                      'https://docs.scipy.org/doc/numpy/user/'
+                      'numpy-for-matlab-users.html). '
+                      'Please adjust your code to use regular ndarray.',
+                      PendingDeprecationWarning, stacklevel=2)
         if isinstance(data, matrix):
             dtype2 = data.dtype
             if (dtype is None):
diff --git a/numpy/matrixlib/tests/test_defmatrix.py b/numpy/matrixlib/tests/test_defmatrix.py
index a02a05c09..e74e83cdb 100644
--- a/numpy/matrixlib/tests/test_defmatrix.py
+++ b/numpy/matrixlib/tests/test_defmatrix.py
@@ -1,5 +1,13 @@
 from __future__ import division, absolute_import, print_function
 
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
 try:
     # Accessing collections abstract classes from collections
     # has been deprecated since Python 3.3
@@ -13,7 +21,7 @@ from numpy.testing import (
     assert_, assert_equal, assert_almost_equal, assert_array_equal,
     assert_array_almost_equal, assert_raises
     )
-from numpy.matrixlib.defmatrix import matrix_power
+from numpy.linalg import matrix_power
 from numpy.matrixlib import mat
 
 class TestCtor(object):
diff --git a/numpy/matrixlib/tests/test_interaction.py b/numpy/matrixlib/tests/test_interaction.py
new file mode 100644
index 000000000..fb4d8f98c
--- /dev/null
+++ b/numpy/matrixlib/tests/test_interaction.py
@@ -0,0 +1,369 @@
+"""Tests of interaction of matrix with other parts of numpy.
+
+Note that tests with MaskedArray and linalg are done in separate files.
+"""
+from __future__ import division, absolute_import, print_function
+
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
+import textwrap
+import warnings
+
+import numpy as np
+from numpy.testing import (assert_, assert_equal, assert_raises,
+                           assert_raises_regex, assert_array_equal,
+                           assert_almost_equal, assert_array_almost_equal)
+
+
+def test_fancy_indexing():
+    # The matrix class messes with the shape. While this is always
+    # weird (getitem is not used, it does not have setitem nor knows
+    # about fancy indexing), this tests gh-3110
+    # 2018-04-29: moved here from core.tests.test_index.
+    m = np.matrix([[1, 2], [3, 4]])
+
+    assert_(isinstance(m[[0, 1, 0], :], np.matrix))
+
+    # gh-3110. Note the transpose currently because matrices do *not*
+    # support dimension fixing for fancy indexing correctly.
+    x = np.asmatrix(np.arange(50).reshape(5, 10))
+    assert_equal(x[:2, np.array(-1)], x[:2, -1].T)
+
+
+def test_polynomial_mapdomain():
+    # test that polynomial preserved matrix subtype.
+    # 2018-04-29: moved here from polynomial.tests.polyutils.
+    dom1 = [0, 4]
+    dom2 = [1, 3]
+    x = np.matrix([dom1, dom1])
+    res = np.polynomial.polyutils.mapdomain(x, dom1, dom2)
+    assert_(isinstance(res, np.matrix))
+
+
+def test_sort_matrix_none():
+    # 2018-04-29: moved here from core.tests.test_multiarray
+    a = np.matrix([[2, 1, 0]])
+    actual = np.sort(a, axis=None)
+    expected = np.matrix([[0, 1, 2]])
+    assert_equal(actual, expected)
+    assert_(type(expected) is np.matrix)
+
+
+def test_partition_matrix_none():
+    # gh-4301
+    # 2018-04-29: moved here from core.tests.test_multiarray
+    a = np.matrix([[2, 1, 0]])
+    actual = np.partition(a, 1, axis=None)
+    expected = np.matrix([[0, 1, 2]])
+    assert_equal(actual, expected)
+    assert_(type(expected) is np.matrix)
+
+
+def test_dot_scalar_and_matrix_of_objects():
+    # Ticket #2469
+    # 2018-04-29: moved here from core.tests.test_multiarray
+    arr = np.matrix([1, 2], dtype=object)
+    desired = np.matrix([[3, 6]], dtype=object)
+    assert_equal(np.dot(arr, 3), desired)
+    assert_equal(np.dot(3, arr), desired)
+
+
+def test_inner_scalar_and_matrix():
+    # 2018-04-29: moved here from core.tests.test_multiarray
+    for dt in np.typecodes['AllInteger'] + np.typecodes['AllFloat'] + '?':
+        sca = np.array(3, dtype=dt)[()]
+        arr = np.matrix([[1, 2], [3, 4]], dtype=dt)
+        desired = np.matrix([[3, 6], [9, 12]], dtype=dt)
+        assert_equal(np.inner(arr, sca), desired)
+        assert_equal(np.inner(sca, arr), desired)
+
+
+def test_inner_scalar_and_matrix_of_objects():
+    # Ticket #4482
+    # 2018-04-29: moved here from core.tests.test_multiarray
+    arr = np.matrix([1, 2], dtype=object)
+    desired = np.matrix([[3, 6]], dtype=object)
+    assert_equal(np.inner(arr, 3), desired)
+    assert_equal(np.inner(3, arr), desired)
+
+
+def test_iter_allocate_output_subtype():
+    # Make sure that the subtype with priority wins
+    # 2018-04-29: moved here from core.tests.test_nditer, given the
+    # matrix specific shape test.
+
+    # matrix vs ndarray
+    a = np.matrix([[1, 2], [3, 4]])
+    b = np.arange(4).reshape(2, 2).T
+    i = np.nditer([a, b, None], [],
+                  [['readonly'], ['readonly'], ['writeonly', 'allocate']])
+    assert_(type(i.operands[2]) is np.matrix)
+    assert_(type(i.operands[2]) is not np.ndarray)
+    assert_equal(i.operands[2].shape, (2, 2))
+
+    # matrix always wants things to be 2D
+    b = np.arange(4).reshape(1, 2, 2)
+    assert_raises(RuntimeError, np.nditer, [a, b, None], [],
+                  [['readonly'], ['readonly'], ['writeonly', 'allocate']])
+    # but if subtypes are disabled, the result can still work
+    i = np.nditer([a, b, None], [],
+                  [['readonly'], ['readonly'],
+                   ['writeonly', 'allocate', 'no_subtype']])
+    assert_(type(i.operands[2]) is np.ndarray)
+    assert_(type(i.operands[2]) is not np.matrix)
+    assert_equal(i.operands[2].shape, (1, 2, 2))
+
+
+def like_function():
+    # 2018-04-29: moved here from core.tests.test_numeric
+    a = np.matrix([[1, 2], [3, 4]])
+    for like_function in np.zeros_like, np.ones_like, np.empty_like:
+        b = like_function(a)
+        assert_(type(b) is np.matrix)
+
+        c = like_function(a, subok=False)
+        assert_(type(c) is not np.matrix)
+
+
+def test_array_astype():
+    # 2018-04-29: copied here from core.tests.test_api
+    # subok=True passes through a matrix
+    a = np.matrix([[0, 1, 2], [3, 4, 5]], dtype='f4')
+    b = a.astype('f4', subok=True, copy=False)
+    assert_(a is b)
+
+    # subok=True is default, and creates a subtype on a cast
+    b = a.astype('i4', copy=False)
+    assert_equal(a, b)
+    assert_equal(type(b), np.matrix)
+
+    # subok=False never returns a matrix
+    b = a.astype('f4', subok=False, copy=False)
+    assert_equal(a, b)
+    assert_(not (a is b))
+    assert_(type(b) is not np.matrix)
+
+
+def test_stack():
+    # 2018-04-29: copied here from core.tests.test_shape_base
+    # check np.matrix cannot be stacked
+    m = np.matrix([[1, 2], [3, 4]])
+    assert_raises_regex(ValueError, 'shape too large to be a matrix',
+                        np.stack, [m, m])
+
+
+def test_object_scalar_multiply():
+    # Tickets #2469 and #4482
+    # 2018-04-29: moved here from core.tests.test_ufunc
+    arr = np.matrix([1, 2], dtype=object)
+    desired = np.matrix([[3, 6]], dtype=object)
+    assert_equal(np.multiply(arr, 3), desired)
+    assert_equal(np.multiply(3, arr), desired)
+
+
+def test_nanfunctions_matrices():
+    # Check that it works and that type and
+    # shape are preserved
+    # 2018-04-29: moved here from core.tests.test_nanfunctions
+    mat = np.matrix(np.eye(3))
+    for f in [np.nanmin, np.nanmax]:
+        res = f(mat, axis=0)
+        assert_(isinstance(res, np.matrix))
+        assert_(res.shape == (1, 3))
+        res = f(mat, axis=1)
+        assert_(isinstance(res, np.matrix))
+        assert_(res.shape == (3, 1))
+        res = f(mat)
+        assert_(np.isscalar(res))
+    # check that rows of nan are dealt with for subclasses (#4628)
+    mat[1] = np.nan
+    for f in [np.nanmin, np.nanmax]:
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            res = f(mat, axis=0)
+            assert_(isinstance(res, np.matrix))
+            assert_(not np.any(np.isnan(res)))
+            assert_(len(w) == 0)
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            res = f(mat, axis=1)
+            assert_(isinstance(res, np.matrix))
+            assert_(np.isnan(res[1, 0]) and not np.isnan(res[0, 0])
+                    and not np.isnan(res[2, 0]))
+            assert_(len(w) == 1, 'no warning raised')
+            assert_(issubclass(w[0].category, RuntimeWarning))
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')
+            res = f(mat)
+            assert_(np.isscalar(res))
+            assert_(res != np.nan)
+            assert_(len(w) == 0)
+
+
+def test_nanfunctions_matrices_general():
+    # Check that it works and that type and
+    # shape are preserved
+    # 2018-04-29: moved here from core.tests.test_nanfunctions
+    mat = np.matrix(np.eye(3))
+    for f in (np.nanargmin, np.nanargmax, np.nansum, np.nanprod,
+              np.nanmean, np.nanvar, np.nanstd):
+        res = f(mat, axis=0)
+        assert_(isinstance(res, np.matrix))
+        assert_(res.shape == (1, 3))
+        res = f(mat, axis=1)
+        assert_(isinstance(res, np.matrix))
+        assert_(res.shape == (3, 1))
+        res = f(mat)
+        assert_(np.isscalar(res))
+
+    for f in np.nancumsum, np.nancumprod:
+        res = f(mat, axis=0)
+        assert_(isinstance(res, np.matrix))
+        assert_(res.shape == (3, 3))
+        res = f(mat, axis=1)
+        assert_(isinstance(res, np.matrix))
+        assert_(res.shape == (3, 3))
+        res = f(mat)
+        assert_(isinstance(res, np.matrix))
+        assert_(res.shape == (1, 3*3))
+
+
+def test_average_matrix():
+    # 2018-04-29: moved here from core.tests.test_function_base.
+    y = np.matrix(np.random.rand(5, 5))
+    assert_array_equal(y.mean(0), np.average(y, 0))
+
+    a = np.matrix([[1, 2], [3, 4]])
+    w = np.matrix([[1, 2], [3, 4]])
+
+    r = np.average(a, axis=0, weights=w)
+    assert_equal(type(r), np.matrix)
+    assert_equal(r, [[2.5, 10.0/3]])
+
+
+def test_trapz_matrix():
+    # Test to make sure matrices give the same answer as ndarrays
+    # 2018-04-29: moved here from core.tests.test_function_base.
+    x = np.linspace(0, 5)
+    y = x * x
+    r = np.trapz(y, x)
+    mx = np.matrix(x)
+    my = np.matrix(y)
+    mr = np.trapz(my, mx)
+    assert_almost_equal(mr, r)
+
+
+def test_ediff1d_matrix():
+    # 2018-04-29: moved here from core.tests.test_arraysetops.
+    assert(isinstance(np.ediff1d(np.matrix(1)), np.matrix))
+    assert(isinstance(np.ediff1d(np.matrix(1), to_begin=1), np.matrix))
+
+
+def test_apply_along_axis_matrix():
+    # this test is particularly malicious because matrix
+    # refuses to become 1d
+    # 2018-04-29: moved here from core.tests.test_shape_base.
+    def double(row):
+        return row * 2
+
+    m = np.matrix([[0, 1], [2, 3]])
+    expected = np.matrix([[0, 2], [4, 6]])
+
+    result = np.apply_along_axis(double, 0, m)
+    assert_(isinstance(result, np.matrix))
+    assert_array_equal(result, expected)
+
+    result = np.apply_along_axis(double, 1, m)
+    assert_(isinstance(result, np.matrix))
+    assert_array_equal(result, expected)
+
+
+def test_kron_matrix():
+    # 2018-04-29: moved here from core.tests.test_shape_base.
+    a = np.ones([2, 2])
+    m = np.asmatrix(a)
+    assert_equal(type(np.kron(a, a)), np.ndarray)
+    assert_equal(type(np.kron(m, m)), np.matrix)
+    assert_equal(type(np.kron(a, m)), np.matrix)
+    assert_equal(type(np.kron(m, a)), np.matrix)
+
+
+class TestConcatenatorMatrix(object):
+    # 2018-04-29: moved here from core.tests.test_index_tricks.
+    def test_matrix(self):
+        a = [1, 2]
+        b = [3, 4]
+
+        ab_r = np.r_['r', a, b]
+        ab_c = np.r_['c', a, b]
+
+        assert_equal(type(ab_r), np.matrix)
+        assert_equal(type(ab_c), np.matrix)
+
+        assert_equal(np.array(ab_r), [[1, 2, 3, 4]])
+        assert_equal(np.array(ab_c), [[1], [2], [3], [4]])
+
+        assert_raises(ValueError, lambda: np.r_['rc', a, b])
+
+    def test_matrix_scalar(self):
+        r = np.r_['r', [1, 2], 3]
+        assert_equal(type(r), np.matrix)
+        assert_equal(np.array(r), [[1, 2, 3]])
+
+    def test_matrix_builder(self):
+        a = np.array([1])
+        b = np.array([2])
+        c = np.array([3])
+        d = np.array([4])
+        actual = np.r_['a, b; c, d']
+        expected = np.bmat([[a, b], [c, d]])
+
+        assert_equal(actual, expected)
+        assert_equal(type(actual), type(expected))
+
+
+def test_array_equal_error_message_matrix():
+    # 2018-04-29: moved here from testing.tests.test_utils.
+    try:
+        assert_equal(np.array([1, 2]), np.matrix([1, 2]))
+    except AssertionError as e:
+        msg = str(e)
+        msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
+        msg_reference = textwrap.dedent("""\
+
+        Arrays are not equal
+
+        (shapes (2,), (1, 2) mismatch)
+         x: array([1, 2])
+         y: matrix([[1, 2]])""")
+        try:
+            assert_equal(msg, msg_reference)
+        except AssertionError:
+            assert_equal(msg2, msg_reference)
+    else:
+        raise AssertionError("Did not raise")
+
+
+def test_array_almost_equal_matrix():
+    # Matrix slicing keeps things 2-D, while array does not necessarily.
+    # See gh-8452.
+    # 2018-04-29: moved here from testing.tests.test_utils.
+    m1 = np.matrix([[1., 2.]])
+    m2 = np.matrix([[1., np.nan]])
+    m3 = np.matrix([[1., -np.inf]])
+    m4 = np.matrix([[np.nan, np.inf]])
+    m5 = np.matrix([[1., 2.], [np.nan, np.inf]])
+    for assert_func in assert_array_almost_equal, assert_almost_equal:
+        for m in m1, m2, m3, m4, m5:
+            assert_func(m, m)
+            a = np.array(m)
+            assert_func(a, m)
+            assert_func(m, a)
diff --git a/numpy/matrixlib/tests/test_masked_matrix.py b/numpy/matrixlib/tests/test_masked_matrix.py
index 80d1cacca..adc2e5419 100644
--- a/numpy/matrixlib/tests/test_masked_matrix.py
+++ b/numpy/matrixlib/tests/test_masked_matrix.py
@@ -1,12 +1,22 @@
 from __future__ import division, absolute_import, print_function
 
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
 import pickle
 
 import numpy as np
-from numpy.ma.testutils import assert_, assert_equal
+from numpy.ma.testutils import (assert_, assert_equal, assert_raises,
+                                assert_array_equal)
 from numpy.ma.core import (masked_array, masked_values, masked, allequal,
                            MaskType, getmask, MaskedArray, nomask,
                            log, add, hypot, divide)
+from numpy.ma.extras import mr_
 
 
 class MMatrix(MaskedArray, np.matrix,):
@@ -209,3 +219,21 @@ class TestSubclassing(object):
         assert_(isinstance(divide(mx, mx), MMatrix))
         assert_(isinstance(divide(mx, x), MMatrix))
         assert_equal(divide(mx, mx), divide(xmx, xmx))
+
+class TestConcatenator(object):
+    # Tests for mr_, the equivalent of r_ for masked arrays.
+
+    def test_matrix_builder(self):
+        assert_raises(np.ma.MAError, lambda: mr_['1, 2; 3, 4'])
+
+    def test_matrix(self):
+        # Test consistency with unmasked version.  If we ever deprecate
+        # matrix, this test should either still pass, or both actual and
+        # expected should fail to be build.
+        actual = mr_['r', 1, 2, 3]
+        expected = np.ma.array(np.r_['r', 1, 2, 3])
+        assert_array_equal(actual, expected)
+
+        # outer type is masked array, inner type is matrix
+        assert_equal(type(actual), type(expected))
+        assert_equal(type(actual.data), type(expected.data))
diff --git a/numpy/matrixlib/tests/test_matrix_linalg.py b/numpy/matrixlib/tests/test_matrix_linalg.py
index 6fc733c2e..85c7693b4 100644
--- a/numpy/matrixlib/tests/test_matrix_linalg.py
+++ b/numpy/matrixlib/tests/test_matrix_linalg.py
@@ -1,6 +1,14 @@
 """ Test functions for linalg module using the matrix class."""
 from __future__ import division, absolute_import, print_function
 
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
 import numpy as np
 
 from numpy.linalg.tests.test_linalg import (
diff --git a/numpy/matrixlib/tests/test_multiarray.py b/numpy/matrixlib/tests/test_multiarray.py
index 6d84bd477..2f04b49d6 100644
--- a/numpy/matrixlib/tests/test_multiarray.py
+++ b/numpy/matrixlib/tests/test_multiarray.py
@@ -1,5 +1,13 @@
 from __future__ import division, absolute_import, print_function
 
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
 import numpy as np
 from numpy.testing import assert_, assert_equal, assert_array_equal
 
diff --git a/numpy/matrixlib/tests/test_numeric.py b/numpy/matrixlib/tests/test_numeric.py
index 95e1c8001..cfdada126 100644
--- a/numpy/matrixlib/tests/test_numeric.py
+++ b/numpy/matrixlib/tests/test_numeric.py
@@ -1,5 +1,13 @@
 from __future__ import division, absolute_import, print_function
 
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
 import numpy as np
 from numpy.testing import assert_equal
 
diff --git a/numpy/matrixlib/tests/test_regression.py b/numpy/matrixlib/tests/test_regression.py
index 70e147279..439704ccf 100644
--- a/numpy/matrixlib/tests/test_regression.py
+++ b/numpy/matrixlib/tests/test_regression.py
@@ -1,5 +1,13 @@
 from __future__ import division, absolute_import, print_function
 
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
 import numpy as np
 from numpy.testing import assert_, assert_equal, assert_raises
 
diff --git a/numpy/polynomial/tests/test_polyutils.py b/numpy/polynomial/tests/test_polyutils.py
index 32ea55716..801c558cc 100644
--- a/numpy/polynomial/tests/test_polyutils.py
+++ b/numpy/polynomial/tests/test_polyutils.py
@@ -63,7 +63,7 @@ class TestDomain(object):
         dom1 = [0, 4]
         dom2 = [1, 3]
         tgt = dom2
-        res = pu. mapdomain(dom1, dom1, dom2)
+        res = pu.mapdomain(dom1, dom1, dom2)
         assert_almost_equal(res, tgt)
 
         # test for complex values
@@ -83,11 +83,14 @@ class TestDomain(object):
         assert_almost_equal(res, tgt)
 
         # test that subtypes are preserved.
+        class MyNDArray(np.ndarray):
+            pass
+
         dom1 = [0, 4]
         dom2 = [1, 3]
-        x = np.matrix([dom1, dom1])
+        x = np.array([dom1, dom1]).view(MyNDArray)
         res = pu.mapdomain(x, dom1, dom2)
-        assert_(isinstance(res, np.matrix))
+        assert_(isinstance(res, MyNDArray))
 
     def test_mapparms(self):
         # test for real values
diff --git a/numpy/testing/_private/utils.py b/numpy/testing/_private/utils.py
index b0c0b0c48..a7935f175 100644
--- a/numpy/testing/_private/utils.py
+++ b/numpy/testing/_private/utils.py
@@ -771,7 +771,11 @@ def assert_array_compare(comparison, x, y, err_msg='', verbose=True,
             reduced = val.ravel()
             cond = reduced.all()
             reduced = reduced.tolist()
-        if not cond:
+        # The below comparison is a hack to ensure that fully masked
+        # results, for which val.ravel().all() returns np.ma.masked,
+        # do not trigger a failure (np.ma.masked != True evaluates as
+        # np.ma.masked, which is falsy).
+        if cond != True:
             match = 100-100.0*reduced.count(1)/len(reduced)
             msg = build_err_msg([x, y],
                                 err_msg
@@ -1369,16 +1373,20 @@ def _assert_valid_refcount(op):
     """
     if not HAS_REFCOUNT:
         return True
-    import numpy as np
+    import numpy as np, gc
 
     b = np.arange(100*100).reshape(100, 100)
     c = b
     i = 1
 
-    rc = sys.getrefcount(i)
-    for j in range(15):
-        d = op(b, c)
-    assert_(sys.getrefcount(i) >= rc)
+    gc.disable()
+    try:
+        rc = sys.getrefcount(i)
+        for j in range(15):
+            d = op(b, c)
+        assert_(sys.getrefcount(i) >= rc)
+    finally:
+        gc.enable()
     del d  # for pyflakes
 
 
diff --git a/numpy/testing/tests/test_utils.py b/numpy/testing/tests/test_utils.py
index 0592e62f8..602cdf5f2 100644
--- a/numpy/testing/tests/test_utils.py
+++ b/numpy/testing/tests/test_utils.py
@@ -286,7 +286,7 @@ class TestEqual(TestArrayEqual):
 
     def test_error_message(self):
         try:
-            self._assert_func(np.array([1, 2]), np.matrix([1, 2]))
+            self._assert_func(np.array([1, 2]), np.array([[1, 2]]))
         except AssertionError as e:
             msg = str(e)
             msg2 = msg.replace("shapes (2L,), (1L, 2L)", "shapes (2,), (1, 2)")
@@ -296,7 +296,7 @@ class TestEqual(TestArrayEqual):
 
             (shapes (2,), (1, 2) mismatch)
              x: array([1, 2])
-             y: matrix([[1, 2]])""")
+             y: array([[1, 2]])""")
             try:
                 assert_equal(msg, msg_reference)
             except AssertionError:
@@ -366,19 +366,23 @@ class TestArrayAlmostEqual(_GenericTest):
         self._assert_func(b, a)
         self._assert_func(b, b)
 
-    def test_matrix(self):
-        # Matrix slicing keeps things 2-D, while array does not necessarily.
-        # See gh-8452.
-        m1 = np.matrix([[1., 2.]])
-        m2 = np.matrix([[1., np.nan]])
-        m3 = np.matrix([[1., -np.inf]])
-        m4 = np.matrix([[np.nan, np.inf]])
-        m5 = np.matrix([[1., 2.], [np.nan, np.inf]])
-        for m in m1, m2, m3, m4, m5:
-            self._assert_func(m, m)
-            a = np.array(m)
-            self._assert_func(a, m)
-            self._assert_func(m, a)
+        # Test fully masked as well (see gh-11123).
+        a = np.ma.MaskedArray(3.5, mask=True)
+        b = np.array([3., 4., 6.5])
+        self._test_equal(a, b)
+        self._test_equal(b, a)
+        a = np.ma.masked
+        b = np.array([3., 4., 6.5])
+        self._test_equal(a, b)
+        self._test_equal(b, a)
+        a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, True, True])
+        b = np.array([1., 2., 3.])
+        self._test_equal(a, b)
+        self._test_equal(b, a)
+        a = np.ma.MaskedArray([3., 4., 6.5], mask=[True, True, True])
+        b = np.array(1.)
+        self._test_equal(a, b)
+        self._test_equal(b, a)
 
     def test_subclass_that_cannot_be_bool(self):
         # While we cannot guarantee testing functions will always work for
@@ -479,20 +483,6 @@ class TestAlmostEqual(_GenericTest):
             # remove anything that's not the array string
             assert_equal(str(e).split('%)\n ')[1], b)
 
-    def test_matrix(self):
-        # Matrix slicing keeps things 2-D, while array does not necessarily.
-        # See gh-8452.
-        m1 = np.matrix([[1., 2.]])
-        m2 = np.matrix([[1., np.nan]])
-        m3 = np.matrix([[1., -np.inf]])
-        m4 = np.matrix([[np.nan, np.inf]])
-        m5 = np.matrix([[1., 2.], [np.nan, np.inf]])
-        for m in m1, m2, m3, m4, m5:
-            self._assert_func(m, m)
-            a = np.array(m)
-            self._assert_func(a, m)
-            self._assert_func(m, a)
-
     def test_subclass_that_cannot_be_bool(self):
         # While we cannot guarantee testing functions will always work for
         # subclasses, the tests should ideally rely only on subclasses having
diff --git a/numpy/tests/test_matlib.py b/numpy/tests/test_matlib.py
index 12116b883..38a7e39df 100644
--- a/numpy/tests/test_matlib.py
+++ b/numpy/tests/test_matlib.py
@@ -1,5 +1,13 @@
 from __future__ import division, absolute_import, print_function
 
+# As we are testing matrices, we ignore its PendingDeprecationWarnings
+try:
+    import pytest
+    pytestmark = pytest.mark.filterwarnings(
+        'ignore:the matrix subclass is not:PendingDeprecationWarning')
+except ImportError:
+    pass
+
 import numpy as np
 import numpy.matlib
 from numpy.testing import assert_array_equal, assert_