29 files changed, 1461 insertions, 297 deletions
diff --git a/numpy/add_newdocs.py b/numpy/add_newdocs.py
index 55bcf8ee1..11a2688e5 100644
--- a/numpy/add_newdocs.py
+++ b/numpy/add_newdocs.py
@@ -1943,6 +1943,7 @@ add_newdoc('numpy.core', 'dot',
     vdot : Complex-conjugating dot product.
     tensordot : Sum products over arbitrary axes.
     einsum : Einstein summation convention.
+    matmul : '@' operator as method with out parameter.
 
     Examples
     --------
@@ -1954,7 +1955,7 @@ add_newdoc('numpy.core', 'dot',
     >>> np.dot([2j, 3j], [2j, 3j])
     (-13+0j)
 
-    For 2-D arrays it's the matrix product:
+    For 2-D arrays it is the matrix product:
 
     >>> a = [[1, 0], [0, 1]]
     >>> b = [[4, 1], [2, 2]]
@@ -1971,6 +1972,130 @@ add_newdoc('numpy.core', 'dot',
 
     """)
 
+add_newdoc('numpy.core', 'matmul',
+    """
+    matmul(a, b, out=None)
+
+    Matrix product of two arrays.
+
+    The behavior depends on the arguments in the following way.
+
+    - If both arguments are 2-D they are multiplied like conventional
+      matrices.
+    - If either argument is N-D, N > 2, it is treated as a stack of
+      matrices residing in the last two indexes and broadcast accordingly.
+    - If the first argument is 1-D, it is promoted to a matrix by
+      prepending a 1 to its dimensions. After matrix multiplication
+      the prepended 1 is removed.
+    - If the second argument is 1-D, it is promoted to a matrix by
+      appending a 1 to its dimensions. After matrix multiplication
+      the appended 1 is removed.
+
+    Multiplication by a scalar is not allowed, use ``*`` instead. Note that
+    multiplying a stack of matrices with a vector will result in a stack of
+    vectors, but matmul will not recognize it as such.
+
+    ``matmul`` differs from ``dot`` in two important ways.
+
+    - Multiplication by scalars is not allowed.
+    - Stacks of matrices are broadcast together as if the matrices
+      were elements.
+
+    .. warning::
+       This function is preliminary and included in Numpy 1.10 for testing
+       and documentation. Its semantics will not change, but the number and
+       order of the optional arguments will.
+
+    .. versionadded:: 1.10.0
+
+    Parameters
+    ----------
+    a : array_like
+        First argument.
+    b : array_like
+        Second argument.
+    out : ndarray, optional
+        Output argument. This must have the exact kind that would be returned
+        if it was not used. In particular, it must have the right type, must be
+        C-contiguous, and its dtype must be the dtype that would be returned
+        for `dot(a,b)`. This is a performance feature. Therefore, if these
+        conditions are not met, an exception is raised, instead of attempting
+        to be flexible.
+
+    Returns
+    -------
+    output : ndarray
+        Returns the dot product of `a` and `b`.  If `a` and `b` are both
+        1-D arrays then a scalar is returned; otherwise an array is
+        returned.  If `out` is given, then it is returned.
+
+    Raises
+    ------
+    ValueError
+        If the last dimension of `a` is not the same size as
+        the second-to-last dimension of `b`.
+
+        If scalar value is passed.
+
+    See Also
+    --------
+    vdot : Complex-conjugating dot product.
+    tensordot : Sum products over arbitrary axes.
+    einsum : Einstein summation convention.
+    dot : alternative matrix product with different broadcasting rules.
+
+    Notes
+    -----
+    The matmul function implements the semantics of the `@` operator introduced
+    in Python 3.5 following PEP465.
+
+    Examples
+    --------
+    For 2-D arrays it is the matrix product:
+
+    >>> a = [[1, 0], [0, 1]]
+    >>> b = [[4, 1], [2, 2]]
+    >>> np.matmul(a, b)
+    array([[4, 1],
+           [2, 2]])
+
+    For 2-D mixed with 1-D, the result is the usual.
+
+    >>> a = [[1, 0], [0, 1]]
+    >>> b = [1, 2]
+    >>> np.matmul(a, b)
+    array([1, 2])
+    >>> np.matmul(b, a)
+    array([1, 2])
+
+
+    Broadcasting is conventional for stacks of arrays
+
+    >>> a = np.arange(2*2*4).reshape((2,2,4))
+    >>> b = np.arange(2*2*4).reshape((2,4,2))
+    >>> np.matmul(a,b).shape
+    (2, 2, 2)
+    >>> np.matmul(a,b)[0,1,1]
+    98
+    >>> sum(a[0,1,:] * b[0,:,1])
+    98
+
+    Vector, vector returns the scalar inner product, but neither argument
+    is complex-conjugated:
+
+    >>> np.matmul([2j, 3j], [2j, 3j])
+    (-13+0j)
+
+    Scalar multiplication raises an error.
+
+    >>> np.matmul([1,2], 3)
+    Traceback (most recent call last):
+    ...
+    ValueError: Scalar operands are not allowed, use '*' instead
+
+    """)
+
+
 add_newdoc('numpy.core', 'einsum',
     """
     einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe')
@@ -4191,10 +4316,12 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('sort',
         last axis.
     kind : {'quicksort', 'mergesort', 'heapsort'}, optional
         Sorting algorithm. Default is 'quicksort'.
-    order : list, optional
+    order : str or list of str, optional
         When `a` is an array with fields defined, this argument specifies
-        which fields to compare first, second, etc.  Not all fields need be
-        specified.
+        which fields to compare first, second, etc.  A single field can
+        be specified as a string, and not all fields need be specified,
+        but unspecified fields will still be used, in the order in which
+        they come up in the dtype, to break ties.
 
     See Also
     --------
@@ -4258,10 +4385,12 @@ add_newdoc('numpy.core.multiarray', 'ndarray', ('partition',
         last axis.
     kind : {'introselect'}, optional
         Selection algorithm. Default is 'introselect'.
-    order : list, optional
+    order : str or list of str, optional
         When `a` is an array with fields defined, this argument specifies
-        which fields to compare first, second, etc.  Not all fields need be
-        specified.
+        which fields to compare first, second, etc.  A single field can
+        be specified as a string, and not all fields need be specified,
+        but unspecified fields will still be used, in the order in which
+        they come up in the dtype, to break ties.
 
     See Also
     --------
diff --git a/numpy/core/_internal.py b/numpy/core/_internal.py
index d32f59390..e80c22dfe 100644
--- a/numpy/core/_internal.py
+++ b/numpy/core/_internal.py
@@ -305,6 +305,174 @@ def _index_fields(ary, fields):
     copy_dtype = {'names':view_dtype['names'], 'formats':view_dtype['formats']}
     return array(view, dtype=copy_dtype, copy=True)
 
+def _get_all_field_offsets(dtype, base_offset=0):
+    """ Returns the types and offsets of all fields in a (possibly structured)
+    data type, including nested fields and subarrays.
+
+    Parameters
+    ----------
+    dtype : data-type
+        Data type to extract fields from.
+    base_offset : int, optional
+        Additional offset to add to all field offsets.
+
+    Returns
+    -------
+    fields : list of (data-type, int) pairs
+        A flat list of (dtype, byte offset) pairs.
+
+    """
+    fields = []
+    if dtype.fields is not None:
+        for name in dtype.names:
+           sub_dtype = dtype.fields[name][0]
+           sub_offset = dtype.fields[name][1] + base_offset
+           fields.extend(_get_all_field_offsets(sub_dtype, sub_offset))
+    else:
+        if dtype.shape:
+            sub_offsets = _get_all_field_offsets(dtype.base, base_offset)
+            count = 1
+            for dim in dtype.shape:
+                count *= dim
+            fields.extend((typ, off + dtype.base.itemsize*j)
+                           for j in range(count) for (typ, off) in sub_offsets)
+        else:
+            fields.append((dtype, base_offset))
+    return fields
+
+def _check_field_overlap(new_fields, old_fields):
+    """ Perform object memory overlap tests for two data-types (see
+    _view_is_safe).
+
+    This function checks that new fields only access memory contained in old
+    fields, and that non-object fields are not interpreted as objects and vice
+    versa.
+
+    Parameters
+    ----------
+    new_fields : list of (data-type, int) pairs
+        Flat list of (dtype, byte offset) pairs for the new data type, as
+        returned by _get_all_field_offsets.
+    old_fields: list of (data-type, int) pairs
+        Flat list of (dtype, byte offset) pairs for the old data type, as
+        returned by _get_all_field_offsets.
+
+    Raises
+    ------
+    TypeError
+        If the new fields are incompatible with the old fields
+
+    """
+    from .numerictypes import object_
+    from .multiarray import dtype
+
+    #first go byte by byte and check we do not access bytes not in old_fields
+    new_bytes = set()
+    for tp, off in new_fields:
+        new_bytes.update(set(range(off, off+tp.itemsize)))
+    old_bytes = set()
+    for tp, off in old_fields:
+        old_bytes.update(set(range(off, off+tp.itemsize)))
+    if new_bytes.difference(old_bytes):
+        raise TypeError("view would access data parent array doesn't own")
+
+    #next check that we do not interpret non-Objects as Objects, and vv
+    obj_offsets = [off for (tp, off) in old_fields if tp.type is object_]
+    obj_size = dtype(object_).itemsize
+
+    for fld_dtype, fld_offset in new_fields:
+        if fld_dtype.type is object_:
+            # check we do not create object views where
+            # there are no objects.
+            if fld_offset not in obj_offsets:
+                raise TypeError("cannot view non-Object data as Object type")
+        else:
+            # next check we do not create non-object views
+            # where there are already objects.
+            # see validate_object_field_overlap for a similar computation.
+            for obj_offset in obj_offsets:
+                if (fld_offset < obj_offset + obj_size and
+                        obj_offset < fld_offset + fld_dtype.itemsize):
+                    raise TypeError("cannot view Object as non-Object type")
+
+def _getfield_is_safe(oldtype, newtype, offset):
+    """ Checks safety of getfield for object arrays.
+
+    As in _view_is_safe, we need to check that memory containing objects is not
+    reinterpreted as a non-object datatype and vice versa.
+
+    Parameters
+    ----------
+    oldtype : data-type
+        Data type of the original ndarray.
+    newtype : data-type
+        Data type of the field being accessed by ndarray.getfield
+    offset : int
+        Offset of the field being accessed by ndarray.getfield
+
+    Raises
+    ------
+    TypeError
+        If the field access is invalid
+
+    """
+    new_fields = _get_all_field_offsets(newtype, offset)
+    old_fields = _get_all_field_offsets(oldtype)
+    # raises if there is a problem
+    _check_field_overlap(new_fields, old_fields)
+
+def _view_is_safe(oldtype, newtype):
+    """ Checks safety of a view involving object arrays, for example when
+    doing::
+
+        np.zeros(10, dtype=oldtype).view(newtype)
+
+    We need to check that
+    1) No memory that is not an object will be interpreted as a object,
+    2) No memory containing an object will be interpreted as an arbitrary type.
+    Both cases can cause segfaults, eg in the case the view is written to.
+    Strategy here is to also disallow views where newtype has any field in a
+    place oldtype doesn't.
+
+    Parameters
+    ----------
+    oldtype : data-type
+        Data type of original ndarray
+    newtype : data-type
+        Data type of the view
+
+    Raises
+    ------
+    TypeError
+        If the new type is incompatible with the old type.
+
+    """
+    new_fields = _get_all_field_offsets(newtype)
+    new_size = newtype.itemsize
+
+    old_fields = _get_all_field_offsets(oldtype)
+    old_size = oldtype.itemsize
+
+    # if the itemsizes are not equal, we need to check that all the
+    # 'tiled positions' of the object match up. Here, we allow
+    # for arbirary itemsizes (even those possibly disallowed
+    # due to stride/data length issues).
+    if old_size ==  new_size:
+        new_num = old_num = 1
+    else:
+        gcd_new_old = _gcd(new_size, old_size)
+        new_num = old_size // gcd_new_old
+        old_num = new_size // gcd_new_old
+
+    # get position of fields within the tiling
+    new_fieldtile = [(tp, off + new_size*j)
+                     for j in range(new_num) for (tp, off) in new_fields]
+    old_fieldtile = [(tp, off + old_size*j)
+                     for j in range(old_num) for (tp, off) in old_fields]
+
+    # raises if there is a problem
+    _check_field_overlap(new_fieldtile, old_fieldtile)
+
 # Given a string containing a PEP 3118 format specifier,
 # construct a Numpy dtype
 
diff --git a/numpy/core/fromnumeric.py b/numpy/core/fromnumeric.py
index 778cef204..3741d821d 100644
--- a/numpy/core/fromnumeric.py
+++ b/numpy/core/fromnumeric.py
@@ -579,10 +579,12 @@ def partition(a, kth, axis=-1, kind='introselect', order=None):
         sorting. The default is -1, which sorts along the last axis.
     kind : {'introselect'}, optional
         Selection algorithm. Default is 'introselect'.
-    order : list, optional
-        When `a` is a structured array, this argument specifies which fields
-        to compare first, second, and so on.  This list does not need to
-        include all of the fields.
+    order : str or list of str, optional
+        When `a` is an array with fields defined, this argument specifies
+        which fields to compare first, second, etc.  A single field can
+        be specified as a string.  Not all fields need be specified, but
+        unspecified fields will still be used, in the order in which they
+        come up in the dtype, to break ties.
 
     Returns
     -------
@@ -662,10 +664,12 @@ def argpartition(a, kth, axis=-1, kind='introselect', order=None):
         the flattened array is used.
     kind : {'introselect'}, optional
         Selection algorithm. Default is 'introselect'
-    order : list, optional
+    order : str or list of str, optional
         When `a` is an array with fields defined, this argument specifies
-        which fields to compare first, second, etc.  Not all fields need be
-        specified.
+        which fields to compare first, second, etc.  A single field can
+        be specified as a string, and not all fields need be specified,
+        but unspecified fields will still be used, in the order in which
+        they come up in the dtype, to break ties.
 
     Returns
     -------
@@ -718,10 +722,12 @@ def sort(a, axis=-1, kind='quicksort', order=None):
         sorting. The default is -1, which sorts along the last axis.
     kind : {'quicksort', 'mergesort', 'heapsort'}, optional
         Sorting algorithm. Default is 'quicksort'.
-    order : list, optional
-        When `a` is a structured array, this argument specifies which fields
-        to compare first, second, and so on.  This list does not need to
-        include all of the fields.
+    order : str or list of str, optional
+        When `a` is an array with fields defined, this argument specifies
+        which fields to compare first, second, etc.  A single field can
+        be specified as a string, and not all fields need be specified,
+        but unspecified fields will still be used, in the order in which
+        they come up in the dtype, to break ties.
 
     Returns
     -------
@@ -831,10 +837,12 @@ def argsort(a, axis=-1, kind='quicksort', order=None):
         the flattened array is used.
     kind : {'quicksort', 'mergesort', 'heapsort'}, optional
         Sorting algorithm.
-    order : list, optional
+    order : str or list of str, optional
         When `a` is an array with fields defined, this argument specifies
-        which fields to compare first, second, etc.  Not all fields need be
-        specified.
+        which fields to compare first, second, etc.  A single field can
+        be specified as a string, and not all fields need be specified,
+        but unspecified fields will still be used, in the order in which
+        they come up in the dtype, to break ties.
 
     Returns
     -------
@@ -2939,16 +2947,16 @@ def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
 
     In single precision, std() can be inaccurate:
 
-    >>> a = np.zeros((2,512*512), dtype=np.float32)
-    >>> a[0,:] = 1.0
-    >>> a[1,:] = 0.1
+    >>> a = np.zeros((2, 512*512), dtype=np.float32)
+    >>> a[0, :] = 1.0
+    >>> a[1, :] = 0.1
     >>> np.std(a)
-    0.45172946707416706
+    0.45000005
 
     Computing the standard deviation in float64 is more accurate:
 
     >>> np.std(a, dtype=np.float64)
-    0.44999999925552653
+    0.44999999925494177
 
     """
     if type(a) is not mu.ndarray:
@@ -3035,7 +3043,7 @@ def var(a, axis=None, dtype=None, out=None, ddof=0,
 
     Examples
     --------
-    >>> a = np.array([[1,2],[3,4]])
+    >>> a = np.array([[1, 2], [3, 4]])
     >>> np.var(a)
     1.25
     >>> np.var(a, axis=0)
@@ -3045,18 +3053,18 @@ def var(a, axis=None, dtype=None, out=None, ddof=0,
 
     In single precision, var() can be inaccurate:
 
-    >>> a = np.zeros((2,512*512), dtype=np.float32)
-    >>> a[0,:] = 1.0
-    >>> a[1,:] = 0.1
+    >>> a = np.zeros((2, 512*512), dtype=np.float32)
+    >>> a[0, :] = 1.0
+    >>> a[1, :] = 0.1
     >>> np.var(a)
-    0.20405951142311096
+    0.20250003
 
     Computing the variance in float64 is more accurate:
 
     >>> np.var(a, dtype=np.float64)
-    0.20249999932997387
+    0.20249999932944759
     >>> ((1-0.55)**2 + (0.1-0.55)**2)/2
-    0.20250000000000001
+    0.2025
 
     """
     if type(a) is not mu.ndarray:
diff --git a/numpy/core/include/numpy/ndarraytypes.h b/numpy/core/include/numpy/ndarraytypes.h
index edae27c72..c11e2505a 100644
--- a/numpy/core/include/numpy/ndarraytypes.h
+++ b/numpy/core/include/numpy/ndarraytypes.h
@@ -1099,27 +1099,6 @@ struct PyArrayIterObject_tag {
         } \
 } while (0)
 
-#define _PyArray_ITER_NEXT3(it) do { \
-        if ((it)->coordinates[2] < (it)->dims_m1[2]) { \
-                (it)->coordinates[2]++; \
-                (it)->dataptr += (it)->strides[2]; \
-        } \
-        else { \
-                (it)->coordinates[2] = 0; \
-                (it)->dataptr -= (it)->backstrides[2]; \
-                if ((it)->coordinates[1] < (it)->dims_m1[1]) { \
-                        (it)->coordinates[1]++; \
-                        (it)->dataptr += (it)->strides[1]; \
-                } \
-                else { \
-                        (it)->coordinates[1] = 0; \
-                        (it)->coordinates[0]++; \
-                        (it)->dataptr += (it)->strides[0] \
-                                (it)->backstrides[1]; \
-                } \
-        } \
-} while (0)
-
 #define PyArray_ITER_NEXT(it) do { \
         _PyAIT(it)->index++; \
         if (_PyAIT(it)->nd_m1 == 0) { \
diff --git a/numpy/core/numeric.py b/numpy/core/numeric.py
index ea2d4d0a2..bf22f6954 100644
--- a/numpy/core/numeric.py
+++ b/numpy/core/numeric.py
@@ -43,7 +43,8 @@ __all__ = ['newaxis', 'ndarray', 'flatiter', 'nditer', 'nested_iters', 'ufunc',
            'Inf', 'inf', 'infty', 'Infinity',
            'nan', 'NaN', 'False_', 'True_', 'bitwise_not',
            'CLIP', 'RAISE', 'WRAP', 'MAXDIMS', 'BUFSIZE', 'ALLOW_THREADS',
-           'ComplexWarning', 'may_share_memory', 'full', 'full_like']
+           'ComplexWarning', 'may_share_memory', 'full', 'full_like',
+           'matmul']
 
 if sys.version_info[0] < 3:
     __all__.extend(['getbuffer', 'newbuffer'])
@@ -390,6 +391,11 @@ lexsort = multiarray.lexsort
 compare_chararrays = multiarray.compare_chararrays
 putmask = multiarray.putmask
 einsum = multiarray.einsum
+dot = multiarray.dot
+inner = multiarray.inner
+vdot = multiarray.vdot
+matmul = multiarray.matmul
+
 
 def asarray(a, dtype=None, order=None):
     """
@@ -1081,11 +1087,6 @@ def outer(a, b, out=None):
     b = asarray(b)
     return multiply(a.ravel()[:, newaxis], b.ravel()[newaxis,:], out)
 
-# try to import blas optimized dot if available
-envbak = os.environ.copy()
-dot = multiarray.dot
-inner = multiarray.inner
-vdot = multiarray.vdot
 
 def alterdot():
     """
diff --git a/numpy/core/records.py b/numpy/core/records.py
index 243645436..b1ea176e4 100644
--- a/numpy/core/records.py
+++ b/numpy/core/records.py
@@ -245,13 +245,12 @@ class record(nt.void):
                 #happens if field is Object type
                 return obj
             if dt.fields:
-                return obj.view((record, obj.dtype.descr))
+                return obj.view((self.__class__, obj.dtype.fields))
             return obj
         else:
             raise AttributeError("'record' object has no "
                     "attribute '%s'" % attr)
 
-
     def __setattr__(self, attr, val):
         if attr in ['setfield', 'getfield', 'dtype']:
             raise AttributeError("Cannot set '%s' attribute" % attr)
@@ -266,6 +265,16 @@ class record(nt.void):
                 raise AttributeError("'record' object has no "
                         "attribute '%s'" % attr)
 
+    def __getitem__(self, indx):
+        obj = nt.void.__getitem__(self, indx)
+
+        # copy behavior of record.__getattribute__,
+        if isinstance(obj, nt.void) and obj.dtype.fields:
+            return obj.view((self.__class__, obj.dtype.fields))
+        else:
+            # return a single element
+            return obj
+
     def pprint(self):
         """Pretty-print all fields."""
         # pretty-print all fields
@@ -438,7 +447,7 @@ class recarray(ndarray):
         # to preserve numpy.record type if present), since nested structured
         # fields do not inherit type.
         if obj.dtype.fields:
-            return obj.view(dtype=(self.dtype.type, obj.dtype.descr))
+            return obj.view(dtype=(self.dtype.type, obj.dtype.fields))
         else:
             return obj.view(ndarray)
 
@@ -478,7 +487,7 @@ class recarray(ndarray):
         # we might also be returning a single element
         if isinstance(obj, ndarray):
             if obj.dtype.fields:
-                return obj.view(dtype=(self.dtype.type, obj.dtype.descr))
+                return obj.view(dtype=(self.dtype.type, obj.dtype.fields))
             else:
                 return obj.view(type=ndarray)
         else:
diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index fb467cb78..c19d31a0d 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -2689,8 +2689,10 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
     PyArray_Descr *descr;
     PyObject *names, *key;
     PyObject *tup;
+    PyArrayObject_fields dummy_struct;
+    PyArrayObject *dummy = (PyArrayObject *)&dummy_struct;
     char *nip1, *nip2;
-    int i, res = 0, swap=0;
+    int i, res = 0, swap = 0;
 
     if (!PyArray_HASFIELDS(ap)) {
         return STRING_compare(ip1, ip2, ap);
@@ -2702,34 +2704,29 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
      */
     names = descr->names;
     for (i = 0; i < PyTuple_GET_SIZE(names); i++) {
-        PyArray_Descr * new;
+        PyArray_Descr *new;
         npy_intp offset;
         key = PyTuple_GET_ITEM(names, i);
         tup = PyDict_GetItem(descr->fields, key);
         if (unpack_field(tup, &new, &offset) < 0) {
             goto finish;
         }
-        /*
-         * TODO: temporarily modifying the array like this
-         *       is bad coding style, should be changed.
-         */
-        ((PyArrayObject_fields *)ap)->descr = new;
-        swap = PyArray_ISBYTESWAPPED(ap);
+        /* descr is the only field checked by compare or copyswap */
+        dummy_struct.descr = new;
+        swap = PyArray_ISBYTESWAPPED(dummy);
         nip1 = ip1 + offset;
         nip2 = ip2 + offset;
-        if ((swap) || (new->alignment > 1)) {
-            if ((swap) || (!npy_is_aligned(nip1, new->alignment))) {
+        if (swap || new->alignment > 1) {
+            if (swap || !npy_is_aligned(nip1, new->alignment)) {
                 /* create buffer and copy */
                 nip1 = npy_alloc_cache(new->elsize);
                 if (nip1 == NULL) {
                     goto finish;
                 }
-                memcpy(nip1, ip1+offset, new->elsize);
-                if (swap)
-                    new->f->copyswap(nip1, NULL, swap, ap);
+                new->f->copyswap(nip1, ip1 + offset, swap, dummy);
             }
-            if ((swap) || (!npy_is_aligned(nip2, new->alignment))) {
-                /* copy data to a buffer */
+            if (swap || !npy_is_aligned(nip2, new->alignment)) {
+                /* create buffer and copy */
                 nip2 = npy_alloc_cache(new->elsize);
                 if (nip2 == NULL) {
                     if (nip1 != ip1 + offset) {
@@ -2737,13 +2734,11 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
                     }
                     goto finish;
                 }
-                memcpy(nip2, ip2 + offset, new->elsize);
-                if (swap)
-                    new->f->copyswap(nip2, NULL, swap, ap);
+                new->f->copyswap(nip2, ip2 + offset, swap, dummy);
             }
         }
-        res = new->f->compare(nip1, nip2, ap);
-        if ((swap) || (new->alignment > 1)) {
+        res = new->f->compare(nip1, nip2, dummy);
+        if (swap || new->alignment > 1) {
             if (nip1 != ip1 + offset) {
                 npy_free_cache(nip1, new->elsize);
             }
@@ -2757,7 +2752,6 @@ VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
     }
 
 finish:
-    ((PyArrayObject_fields *)ap)->descr = descr;
     return res;
 }
 
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index 2bb45a6e0..13e172a0e 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -29,6 +29,8 @@
 #define NPY_NEXT_ALIGNED_OFFSET(offset, alignment) \
                 (((offset) + (alignment) - 1) & (-(alignment)))
 
+#define PyDictProxy_Check(obj) (Py_TYPE(obj) == &PyDictProxy_Type)
+
 static PyObject *typeDict = NULL;   /* Must be explicitly loaded */
 
 static PyArray_Descr *
@@ -270,7 +272,7 @@ _convert_from_tuple(PyObject *obj)
             type->elsize = itemsize;
         }
     }
-    else if (PyDict_Check(val)) {
+    else if (PyDict_Check(val) || PyDictProxy_Check(val)) {
         /* Assume it's a metadata dictionary */
         if (PyDict_Merge(type->metadata, val, 0) == -1) {
             Py_DECREF(type);
@@ -944,15 +946,21 @@ _convert_from_dict(PyObject *obj, int align)
     if (fields == NULL) {
         return (PyArray_Descr *)PyErr_NoMemory();
     }
-    names = PyDict_GetItemString(obj, "names");
-    descrs = PyDict_GetItemString(obj, "formats");
+    /* Use PyMapping_GetItemString to support dictproxy objects as well */
+    names = PyMapping_GetItemString(obj, "names");
+    descrs = PyMapping_GetItemString(obj, "formats");
     if (!names || !descrs) {
         Py_DECREF(fields);
+        PyErr_Clear();
         return _use_fields_dict(obj, align);
     }
     n = PyObject_Length(names);
-    offsets = PyDict_GetItemString(obj, "offsets");
-    titles = PyDict_GetItemString(obj, "titles");
+    offsets = PyMapping_GetItemString(obj, "offsets");
+    titles = PyMapping_GetItemString(obj, "titles");
+    if (!offsets || !titles) {
+        PyErr_Clear();
+    }
+
     if ((n > PyObject_Length(descrs))
         || (offsets && (n > PyObject_Length(offsets)))
         || (titles && (n > PyObject_Length(titles)))) {
@@ -966,8 +974,10 @@ _convert_from_dict(PyObject *obj, int align)
      * If a property 'aligned' is in the dict, it overrides the align flag
      * to be True if it not already true.
      */
-    tmp = PyDict_GetItemString(obj, "aligned");
-    if (tmp != NULL) {
+    tmp = PyMapping_GetItemString(obj, "aligned");
+    if (tmp == NULL) {
+        PyErr_Clear();
+    } else {
         if (tmp == Py_True) {
             align = 1;
         }
@@ -1138,8 +1148,10 @@ _convert_from_dict(PyObject *obj, int align)
     }
 
     /* Override the itemsize if provided */
-    tmp = PyDict_GetItemString(obj, "itemsize");
-    if (tmp != NULL) {
+    tmp = PyMapping_GetItemString(obj, "itemsize");
+    if (tmp == NULL) {
+        PyErr_Clear();
+    } else {
         itemsize = (int)PyInt_AsLong(tmp);
         if (itemsize == -1 && PyErr_Occurred()) {
             Py_DECREF(new);
@@ -1168,17 +1180,18 @@ _convert_from_dict(PyObject *obj, int align)
     }
 
     /* Add the metadata if provided */
-    metadata = PyDict_GetItemString(obj, "metadata");
+    metadata = PyMapping_GetItemString(obj, "metadata");
 
-    if (new->metadata == NULL) {
+    if (metadata == NULL) {
+        PyErr_Clear();
+    }
+    else if (new->metadata == NULL) {
         new->metadata = metadata;
         Py_XINCREF(new->metadata);
     }
-    else if (metadata != NULL) {
-        if (PyDict_Merge(new->metadata, metadata, 0) == -1) {
-            Py_DECREF(new);
-            return NULL;
-        }
+    else if (PyDict_Merge(new->metadata, metadata, 0) == -1) {
+        Py_DECREF(new);
+        return NULL;
     }
     return new;
 
@@ -1446,7 +1459,7 @@ PyArray_DescrConverter(PyObject *obj, PyArray_Descr **at)
         }
         return NPY_SUCCEED;
     }
-    else if (PyDict_Check(obj)) {
+    else if (PyDict_Check(obj) || PyDictProxy_Check(obj)) {
         /* or a dictionary */
         *at = _convert_from_dict(obj,0);
         if (*at == NULL) {
@@ -2741,7 +2754,7 @@ arraydescr_setstate(PyArray_Descr *self, PyObject *args)
 NPY_NO_EXPORT int
 PyArray_DescrAlignConverter(PyObject *obj, PyArray_Descr **at)
 {
-    if (PyDict_Check(obj)) {
+    if (PyDict_Check(obj) || PyDictProxy_Check(obj)) {
         *at =  _convert_from_dict(obj, 1);
     }
     else if (PyBytes_Check(obj)) {
@@ -2777,7 +2790,7 @@ PyArray_DescrAlignConverter(PyObject *obj, PyArray_Descr **at)
 NPY_NO_EXPORT int
 PyArray_DescrAlignConverter2(PyObject *obj, PyArray_Descr **at)
 {
-    if (PyDict_Check(obj)) {
+    if (PyDict_Check(obj) || PyDictProxy_Check(obj)) {
         *at =  _convert_from_dict(obj, 1);
     }
     else if (PyBytes_Check(obj)) {
diff --git a/numpy/core/src/multiarray/getset.c b/numpy/core/src/multiarray/getset.c
index cbc798273..9ba12b092 100644
--- a/numpy/core/src/multiarray/getset.c
+++ b/numpy/core/src/multiarray/getset.c
@@ -9,8 +9,8 @@
 #include "numpy/arrayobject.h"
 
 #include "npy_config.h"
-
 #include "npy_pycompat.h"
+#include "npy_import.h"
 
 #include "common.h"
 #include "scalartypes.h"
@@ -434,6 +434,13 @@ array_descr_set(PyArrayObject *self, PyObject *arg)
     npy_intp newdim;
     int i;
     char *msg = "new type not compatible with array.";
+    PyObject *safe;
+    static PyObject *checkfunc = NULL;
+
+    npy_cache_pyfunc("numpy.core._internal", "_view_is_safe", &checkfunc);
+    if (checkfunc == NULL) {
+        return -1;
+    }
 
     if (arg == NULL) {
         PyErr_SetString(PyExc_AttributeError,
@@ -448,15 +455,13 @@ array_descr_set(PyArrayObject *self, PyObject *arg)
         return -1;
     }
 
-    if (PyDataType_FLAGCHK(newtype, NPY_ITEM_HASOBJECT) ||
-        PyDataType_FLAGCHK(newtype, NPY_ITEM_IS_POINTER) ||
-        PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_ITEM_HASOBJECT) ||
-        PyDataType_FLAGCHK(PyArray_DESCR(self), NPY_ITEM_IS_POINTER)) {
-        PyErr_SetString(PyExc_TypeError,
-                "Cannot change data-type for object array.");
+    /* check that we are not reinterpreting memory containing Objects */
+    safe = PyObject_CallFunction(checkfunc, "OO", PyArray_DESCR(self), newtype);
+    if (safe == NULL) {
         Py_DECREF(newtype);
         return -1;
     }
+    Py_DECREF(safe);
 
     if (newtype->elsize == 0) {
         /* Allow a void view */
diff --git a/numpy/core/src/multiarray/iterators.c b/numpy/core/src/multiarray/iterators.c
index e56237573..829994b1e 100644
--- a/numpy/core/src/multiarray/iterators.c
+++ b/numpy/core/src/multiarray/iterators.c
@@ -1577,7 +1577,8 @@ static PyObject *
 arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *kwds)
 {
 
-    Py_ssize_t n, i;
+    Py_ssize_t n = 0;
+    Py_ssize_t i, j, k;
     PyArrayMultiIterObject *multi;
     PyObject *arr;
 
@@ -1587,13 +1588,27 @@ arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *k
         return NULL;
     }
 
-    n = PyTuple_Size(args);
+    for (j = 0; j < PyTuple_Size(args); ++j) {
+        PyObject *obj = PyTuple_GET_ITEM(args, j);
+
+        if (PyObject_IsInstance(obj, (PyObject *)&PyArrayMultiIter_Type)) {
+            /*
+             * If obj is a multi-iterator, all its arrays will be added
+             * to the new multi-iterator.
+             */
+            n += ((PyArrayMultiIterObject *)obj)->numiter;
+        }
+        else {
+            /* If not, will try to convert it to a single array */
+            ++n;
+        }
+    }
     if (n < 2 || n > NPY_MAXARGS) {
         if (PyErr_Occurred()) {
             return NULL;
         }
         PyErr_Format(PyExc_ValueError,
-                     "Need at least two and fewer than (%d) "   \
+                     "Need at least two and fewer than (%d) "
                      "array objects.", NPY_MAXARGS);
         return NULL;
     }
@@ -1606,20 +1621,38 @@ arraymultiter_new(PyTypeObject *NPY_UNUSED(subtype), PyObject *args, PyObject *k
 
     multi->numiter = n;
     multi->index = 0;
-    for (i = 0; i < n; i++) {
-        multi->iters[i] = NULL;
-    }
-    for (i = 0; i < n; i++) {
-        arr = PyArray_FromAny(PyTuple_GET_ITEM(args, i), NULL, 0, 0, 0, NULL);
-        if (arr == NULL) {
-            goto fail;
+    i = 0;
+    for (j = 0; j < PyTuple_GET_SIZE(args); ++j) {
+        PyObject *obj = PyTuple_GET_ITEM(args, j);
+        PyArrayIterObject *it;
+
+        if (PyObject_IsInstance(obj, (PyObject *)&PyArrayMultiIter_Type)) {
+            PyArrayMultiIterObject *mit = (PyArrayMultiIterObject *)obj;
+
+            for (k = 0; k < mit->numiter; ++k) {
+                arr = (PyObject *)mit->iters[k]->ao;
+                assert (arr != NULL);
+                it = (PyArrayIterObject *)PyArray_IterNew(arr);
+                if (it == NULL) {
+                    goto fail;
+                }
+                multi->iters[i++] = it;
+            }
         }
-        if ((multi->iters[i] = (PyArrayIterObject *)PyArray_IterNew(arr))
-                == NULL) {
-            goto fail;
+        else {
+            arr = PyArray_FromAny(obj, NULL, 0, 0, 0, NULL);
+            if (arr == NULL) {
+                goto fail;
+            }
+            it = (PyArrayIterObject *)PyArray_IterNew(arr);
+            if (it == NULL) {
+                goto fail;
+            }
+            multi->iters[i++] = it;
+            Py_DECREF(arr);
         }
-        Py_DECREF(arr);
     }
+    assert (i == n);
     if (PyArray_Broadcast(multi) < 0) {
         goto fail;
     }
diff --git a/numpy/core/src/multiarray/methods.c b/numpy/core/src/multiarray/methods.c
index a51453dd1..d06e4a512 100644
--- a/numpy/core/src/multiarray/methods.c
+++ b/numpy/core/src/multiarray/methods.c
@@ -10,6 +10,7 @@
 
 #include "npy_config.h"
 #include "npy_pycompat.h"
+#include "npy_import.h"
 #include "ufunc_override.h"
 #include "common.h"
 #include "ctors.h"
@@ -358,15 +359,23 @@ NPY_NO_EXPORT PyObject *
 PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int offset)
 {
     PyObject *ret = NULL;
+    PyObject *safe;
+    static PyObject *checkfunc = NULL;
 
-    if (offset < 0 || (offset + typed->elsize) > PyArray_DESCR(self)->elsize) {
-        PyErr_Format(PyExc_ValueError,
-                     "Need 0 <= offset <= %d for requested type "
-                     "but received offset = %d",
-                     PyArray_DESCR(self)->elsize-typed->elsize, offset);
-        Py_DECREF(typed);
+    npy_cache_pyfunc("numpy.core._internal", "_getfield_is_safe", &checkfunc);
+    if (checkfunc == NULL) {
         return NULL;
     }
+
+    /* check that we are not reinterpreting memory containing Objects */
+    /* only returns True or raises */
+    safe = PyObject_CallFunction(checkfunc, "OOi", PyArray_DESCR(self),
+                                 typed, offset);
+    if (safe == NULL) {
+        return NULL;
+    }
+    Py_DECREF(safe);
+
     ret = PyArray_NewFromDescr(Py_TYPE(self),
                                typed,
                                PyArray_NDIM(self), PyArray_DIMS(self),
@@ -417,23 +426,12 @@ PyArray_SetField(PyArrayObject *self, PyArray_Descr *dtype,
     PyObject *ret = NULL;
     int retval = 0;
 
-    if (offset < 0 || (offset + dtype->elsize) > PyArray_DESCR(self)->elsize) {
-        PyErr_Format(PyExc_ValueError,
-                     "Need 0 <= offset <= %d for requested type "
-                     "but received offset = %d",
-                     PyArray_DESCR(self)->elsize-dtype->elsize, offset);
-        Py_DECREF(dtype);
-        return -1;
-    }
-    ret = PyArray_NewFromDescr(Py_TYPE(self),
-                           dtype, PyArray_NDIM(self), PyArray_DIMS(self),
-                           PyArray_STRIDES(self), PyArray_BYTES(self) + offset,
-                           PyArray_FLAGS(self), (PyObject *)self);
+    /* getfield returns a view we can write to */
+    ret = PyArray_GetField(self, dtype, offset);
     if (ret == NULL) {
         return -1;
     }
 
-    PyArray_UpdateFlags((PyArrayObject *)ret, NPY_ARRAY_UPDATE_ALL);
     retval = PyArray_CopyObject((PyArrayObject *)ret, val);
     Py_DECREF(ret);
     return retval;
@@ -455,13 +453,6 @@ array_setfield(PyArrayObject *self, PyObject *args, PyObject *kwds)
         return NULL;
     }
 
-    if (PyDataType_REFCHK(PyArray_DESCR(self))) {
-        PyErr_SetString(PyExc_RuntimeError,
-                    "cannot call setfield on an object array");
-        Py_DECREF(dtype);
-        return NULL;
-    }
-
     if (PyArray_SetField(self, dtype, offset, value) < 0) {
         return NULL;
     }
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 369b5a8e1..7980a0de7 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -27,8 +27,8 @@
 #include "numpy/npy_math.h"
 
 #include "npy_config.h"
-
 #include "npy_pycompat.h"
+#include "npy_import.h"
 
 NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
@@ -64,6 +64,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 /* Only here for API compatibility */
 NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
 
+
 /*NUMPY_API
  * Get Priority from object
  */
@@ -239,7 +240,7 @@ PyArray_AsCArray(PyObject **op, void *ptr, npy_intp *dims, int nd,
     *op = (PyObject *)ap;
     return 0;
 
- fail:
+fail:
     PyErr_SetString(PyExc_MemoryError, "no memory");
     return -1;
 }
@@ -930,7 +931,7 @@ PyArray_InnerProduct(PyObject *op1, PyObject *op2)
     Py_DECREF(ap2);
     return (PyObject *)ret;
 
- fail:
+fail:
     Py_XDECREF(ap1);
     Py_XDECREF(ap2);
     Py_XDECREF(ret);
@@ -1049,7 +1050,8 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
         goto fail;
     }
 
-    op = PyArray_DATA(ret); os = PyArray_DESCR(ret)->elsize;
+    op = PyArray_DATA(ret);
+    os = PyArray_DESCR(ret)->elsize;
     axis = PyArray_NDIM(ap1)-1;
     it1 = (PyArrayIterObject *)
         PyArray_IterAllButAxis((PyObject *)ap1, &axis);
@@ -1083,7 +1085,7 @@ PyArray_MatrixProduct2(PyObject *op1, PyObject *op2, PyArrayObject* out)
     Py_DECREF(ap2);
     return (PyObject *)ret;
 
- fail:
+fail:
     Py_XDECREF(ap1);
     Py_XDECREF(ap2);
     Py_XDECREF(ret);
@@ -1844,7 +1846,7 @@ array_copyto(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 
     Py_RETURN_NONE;
 
- fail:
+fail:
     Py_XDECREF(src);
     Py_XDECREF(wheremask);
     return NULL;
@@ -1887,7 +1889,7 @@ array_empty(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
     PyDimMem_FREE(shape.ptr);
     return (PyObject *)ret;
 
- fail:
+fail:
     Py_XDECREF(typecode);
     PyDimMem_FREE(shape.ptr);
     return NULL;
@@ -1918,7 +1920,7 @@ array_empty_like(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
 
     return (PyObject *)ret;
 
- fail:
+fail:
     Py_XDECREF(prototype);
     Py_XDECREF(dtype);
     return NULL;
@@ -2041,7 +2043,7 @@ array_zeros(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
     PyDimMem_FREE(shape.ptr);
     return (PyObject *)ret;
 
- fail:
+fail:
     Py_XDECREF(typecode);
     PyDimMem_FREE(shape.ptr);
     return (PyObject *)ret;
@@ -2369,6 +2371,170 @@ fail:
 }
 
 
+
+/*
+ * matmul
+ *
+ * Implements the protocol used by the '@' operator defined in PEP 364.
+ * Not in the NUMPY API at this time, maybe later.
+ *
+ *
+ * in1:        Left hand side operand
+ * in2:        Right hand side operand
+ * out:        Either NULL, or an array into which the output should be placed.
+ *
+ * Returns NULL on error.
+ * Returns NotImplemented on priority override.
+ */
+static PyObject *
+array_matmul(PyObject *NPY_UNUSED(m), PyObject *args, PyObject* kwds)
+{
+    static PyObject *matmul = NULL;
+    int errval;
+    PyObject *override = NULL;
+    PyObject *in1, *in2, *out = NULL;
+    char* kwlist[] = {"a", "b", "out", NULL };
+    PyArrayObject *ap1, *ap2, *ret = NULL;
+    NPY_ORDER order = NPY_KEEPORDER;
+    NPY_CASTING casting = NPY_SAFE_CASTING;
+    PyArray_Descr *dtype;
+    int nd1, nd2, typenum;
+    char *subscripts;
+    PyArrayObject *ops[2];
+
+    npy_cache_pyfunc("numpy.core.multiarray", "matmul", &matmul);
+    if (matmul == NULL) {
+        return NULL;
+    }
+
+    errval = PyUFunc_CheckOverride(matmul, "__call__",
+                                   args, kwds, &override, 2);
+    if (errval) {
+        return NULL;
+    }
+    else if (override) {
+        return override;
+    }
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O", kwlist,
+                                     &in1, &in2, &out)) {
+        return NULL;
+    }
+
+    if (out == Py_None) {
+        out = NULL;
+    }
+    if (out != NULL && !PyArray_Check(out)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "'out' must be an array");
+        return NULL;
+    }
+
+    dtype = PyArray_DescrFromObject(in1, NULL);
+    dtype = PyArray_DescrFromObject(in2, dtype);
+    if (dtype == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Cannot find a common data type.");
+        return NULL;
+    }
+    typenum = dtype->type_num;
+
+    if (typenum == NPY_OBJECT) {
+        /* matmul is not currently implemented for object arrays */
+        PyErr_SetString(PyExc_TypeError,
+                "Object arrays are not currently supported");
+        Py_DECREF(dtype);
+        return NULL;
+    }
+
+    ap1 = (PyArrayObject *)PyArray_FromAny(in1, dtype, 0, 0,
+                                           NPY_ARRAY_ALIGNED, NULL);
+    if (ap1 == NULL) {
+        return NULL;
+    }
+
+    Py_INCREF(dtype);
+    ap2 = (PyArrayObject *)PyArray_FromAny(in2, dtype, 0, 0,
+                                           NPY_ARRAY_ALIGNED, NULL);
+    if (ap2 == NULL) {
+        Py_DECREF(ap1);
+        return NULL;
+    }
+
+    if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
+        /* Scalars are rejected */
+        PyErr_SetString(PyExc_ValueError,
+                "Scalar operands are not allowed, use '*' instead");
+        return NULL;
+    }
+
+    nd1 = PyArray_NDIM(ap1);
+    nd2 = PyArray_NDIM(ap2);
+
+#if defined(HAVE_CBLAS)
+    if (nd1 <= 2 && nd2 <= 2 &&
+            (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
+             NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
+        return cblas_matrixproduct(typenum, ap1, ap2, out);
+    }
+#endif
+
+    /*
+     * Use einsum for the stacked cases. This is a quick implementation
+     * to avoid setting up the proper iterators. Einsum broadcasts, so
+     * we need to check dimensions before the call.
+     */
+    if (nd1 == 1 && nd2 == 1) {
+        /* vector vector */
+        if (PyArray_DIM(ap1, 0) != PyArray_DIM(ap2, 0)) {
+            dot_alignment_error(ap1, 0, ap2, 0);
+            goto fail;
+        }
+        subscripts = "i, i";
+    }
+    else if (nd1 == 1) {
+        /* vector  matrix */
+        if (PyArray_DIM(ap1, 0) != PyArray_DIM(ap2, nd2 - 2)) {
+            dot_alignment_error(ap1, 0, ap2, nd2 - 2);
+            goto fail;
+        }
+        subscripts = "i, ...ij";
+    }
+    else if (nd2 == 1) {
+        /* matrix  vector */
+        if (PyArray_DIM(ap1, nd1 - 1) != PyArray_DIM(ap2, 0)) {
+            dot_alignment_error(ap1, nd1 - 1, ap2, 0);
+            goto fail;
+        }
+        subscripts = "...i, i";
+    }
+    else {
+        /* matrix * matrix */
+        if (PyArray_DIM(ap1, nd1 - 1) != PyArray_DIM(ap2, nd2 - 2)) {
+            dot_alignment_error(ap1, nd1 - 1, ap2, nd2 - 2);
+            goto fail;
+        }
+        subscripts = "...ij, ...jk";
+    }
+    ops[0] = ap1;
+    ops[1] = ap2;
+    ret = PyArray_EinsteinSum(subscripts, 2, ops, NULL, order, casting, out);
+    Py_DECREF(ap1);
+    Py_DECREF(ap2);
+
+    /* If no output was supplied, possibly convert to a scalar */
+    if (ret != NULL && out == NULL) {
+        ret = PyArray_Return((PyArrayObject *)ret);
+    }
+    return (PyObject *)ret;
+
+fail:
+    Py_XDECREF(ap1);
+    Py_XDECREF(ap2);
+    return NULL;
+}
+
+
 static int
 einsum_sub_op_from_str(PyObject *args, PyObject **str_obj, char **subscripts,
                        PyArrayObject **op)
@@ -2862,7 +3028,7 @@ array__reconstruct(PyObject *NPY_UNUSED(dummy), PyObject *args)
 
     return ret;
 
- fail:
+fail:
     evil_global_disable_warn_O4O8_flag = 0;
 
     Py_XDECREF(dtype);
@@ -3090,7 +3256,7 @@ PyArray_Where(PyObject *condition, PyObject *x, PyObject *y)
         return ret;
     }
 
- fail:
+fail:
     Py_DECREF(arr);
     Py_XDECREF(ax);
     Py_XDECREF(ay);
@@ -3936,6 +4102,9 @@ static struct PyMethodDef array_module_methods[] = {
     {"vdot",
         (PyCFunction)array_vdot,
         METH_VARARGS | METH_KEYWORDS, NULL},
+    {"matmul",
+        (PyCFunction)array_matmul,
+        METH_VARARGS | METH_KEYWORDS, NULL},
     {"einsum",
         (PyCFunction)array_einsum,
         METH_VARARGS|METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 168799f11..a2160afd8 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -8,11 +8,10 @@
 #include "numpy/arrayobject.h"
 
 #include "npy_config.h"
-
 #include "npy_pycompat.h"
-
-#include "number.h"
+#include "npy_import.h"
 #include "common.h"
+#include "number.h"
 
 /*************************************************************************
  ****************   Implement Number Protocol ****************************
@@ -386,6 +385,24 @@ array_remainder(PyArrayObject *m1, PyObject *m2)
     return PyArray_GenericBinaryFunction(m1, m2, n_ops.remainder);
 }
 
+
+#if PY_VERSION_HEX >= 0x03050000
+/* Need this to be version dependent on account of the slot check */
+static PyObject *
+array_matrix_multiply(PyArrayObject *m1, PyObject *m2)
+{
+    static PyObject *matmul = NULL;
+
+    npy_cache_pyfunc("numpy.core.multiarray", "matmul", &matmul);
+    if (matmul == NULL) {
+        return NULL;
+    }
+    GIVE_UP_IF_HAS_RIGHT_BINOP(m1, m2, "__matmul__", "__rmatmul__",
+                               0, nb_matrix_multiply);
+    return PyArray_GenericBinaryFunction(m1, m2, matmul);
+}
+#endif
+
 /* Determine if object is a scalar and if so, convert the object
  *   to a double and place it in the out_exponent argument
  *   and return the "scalar kind" as a result.   If the object is
@@ -723,6 +740,7 @@ array_inplace_true_divide(PyArrayObject *m1, PyObject *m2)
                                                 n_ops.true_divide);
 }
 
+
 static int
 _array_nonzero(PyArrayObject *mp)
 {
@@ -1066,5 +1084,9 @@ NPY_NO_EXPORT PyNumberMethods array_as_number = {
     (binaryfunc)array_true_divide,              /*nb_true_divide*/
     (binaryfunc)array_inplace_floor_divide,     /*nb_inplace_floor_divide*/
     (binaryfunc)array_inplace_true_divide,      /*nb_inplace_true_divide*/
-    (unaryfunc)array_index,                     /* nb_index */
+    (unaryfunc)array_index,                     /*nb_index */
+#if PY_VERSION_HEX >= 0x03050000
+    (binaryfunc)array_matrix_multiply,          /*nb_matrix_multiply*/
+    (binaryfunc)NULL,                           /*nb_inplacematrix_multiply*/
+#endif
 };
diff --git a/numpy/core/src/private/npy_import.h b/numpy/core/src/private/npy_import.h
new file mode 100644
index 000000000..a75c59884
--- /dev/null
+++ b/numpy/core/src/private/npy_import.h
@@ -0,0 +1,32 @@
+#ifndef NPY_IMPORT_H
+#define NPY_IMPORT_H
+
+#include <Python.h>
+
+/*! \brief Fetch and cache Python function.
+ *
+ * Import a Python function and cache it for use. The function checks if
+ * cache is NULL, and if not NULL imports the Python function specified by
+ * \a module and \a function, increments its reference count, and stores
+ * the result in \a cache. Usually \a cache will be a static variable and
+ * should be initialized to NULL. On error \a cache will contain NULL on
+ * exit,
+ *
+ * @param module Absolute module name.
+ * @param function Function name.
+ * @param cache Storage location for imported function.
+ */
+NPY_INLINE static void
+npy_cache_pyfunc(const char *module, const char *function, PyObject **cache)
+{
+    if (*cache == NULL) {
+        PyObject *mod = PyImport_ImportModule(module);
+
+        if (mod != NULL) {
+            *cache = PyObject_GetAttrString(mod, function);
+            Py_DECREF(mod);
+        }
+    }
+}
+
+#endif
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 9040c262b..b293cdbbc 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -245,6 +245,14 @@ class TestRecord(TestCase):
                                    ('f1', 'datetime64[Y]'),
                                    ('f2', 'i8')]))
 
+    def test_from_dictproxy(self):
+        # Tests for PR #5920
+        dt = np.dtype({'names': ['a', 'b'], 'formats': ['i4', 'f4']})
+        assert_dtype_equal(dt, np.dtype(dt.fields))
+        dt2 = np.dtype((np.void, dt.fields))
+        assert_equal(dt2.fields, dt.fields)
+
+
 class TestSubarray(TestCase):
     def test_single_subarray(self):
         a = np.dtype((np.int, (2)))
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 33b75d490..74c57e18a 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -8,6 +8,7 @@ import shutil
 import warnings
 import operator
 import io
+import itertools
 if sys.version_info[0] >= 3:
     import builtins
 else:
@@ -808,6 +809,14 @@ class TestStructured(TestCase):
             t = [('a', '>i4'), ('b', '<f8'), ('c', 'i4')]
             assert_(not np.can_cast(a.dtype, t, casting=casting))
 
+    def test_objview(self):
+        # https://github.com/numpy/numpy/issues/3286
+        a = np.array([], dtype=[('a', 'f'), ('b', 'f'), ('c', 'O')])
+        a[['a', 'b']] # TypeError?
+
+        # https://github.com/numpy/numpy/issues/3253
+        dat2 = np.zeros(3, [('A', 'i'), ('B', '|O')])
+        new2 = dat2[['B', 'A']] # TypeError?
 
 class TestBool(TestCase):
     def test_test_interning(self):
@@ -3576,8 +3585,9 @@ class TestRecord(TestCase):
 
 class TestView(TestCase):
     def test_basic(self):
-        x = np.array([(1, 2, 3, 4), (5, 6, 7, 8)], dtype=[('r', np.int8), ('g', np.int8),
-                                                  ('b', np.int8), ('a', np.int8)])
+        x = np.array([(1, 2, 3, 4), (5, 6, 7, 8)],
+                      dtype=[('r', np.int8), ('g', np.int8),
+                             ('b', np.int8), ('a', np.int8)])
         # We must be specific about the endianness here:
         y = x.view(dtype='<i4')
         # ... and again without the keyword.
@@ -3927,7 +3937,6 @@ class TestDot(TestCase):
         assert_raises(TypeError, c.dot, b)
 
     def test_accelerate_framework_sgemv_fix(self):
-        from itertools import product
         if sys.platform != 'darwin':
             return
 
@@ -3954,7 +3963,7 @@ class TestDot(TestCase):
         s = aligned_array((100, 100), 15, np.float32)
         np.dot(s, m) # this will always segfault if the bug is present
 
-        testdata = product((15,32), (10000,), (200,89), ('C','F'))
+        testdata = itertools.product((15,32), (10000,), (200,89), ('C','F'))
         for align, m, n, a_order in testdata:
             # Calculation in double precision
             A_d = np.random.rand(m, n)
@@ -3994,6 +4003,287 @@ class TestDot(TestCase):
             assert_dot_close(A_f_12, X_f_2, desired)
 
 
+class MatmulCommon():
+    """Common tests for '@' operator and numpy.matmul.
+
+    Do not derive from TestCase to avoid nose running it.
+
+    """
+    # Should work with these types. Will want to add
+    # "O" at some point
+    types = "?bhilqBHILQefdgFDG"
+
+    def test_exceptions(self):
+        dims = [
+            ((1,), (2,)),            # mismatched vector vector
+            ((2, 1,), (2,)),         # mismatched matrix vector
+            ((2,), (1, 2)),          # mismatched vector matrix
+            ((1, 2), (3, 1)),        # mismatched matrix matrix
+            ((1,), ()),              # vector scalar
+            ((), (1)),               # scalar vector
+            ((1, 1), ()),            # matrix scalar
+            ((), (1, 1)),            # scalar matrix
+            ((2, 2, 1), (3, 1, 2)),  # cannot broadcast
+            ]
+
+        for dt, (dm1, dm2) in itertools.product(self.types, dims):
+            a = np.ones(dm1, dtype=dt)
+            b = np.ones(dm2, dtype=dt)
+            assert_raises(ValueError, self.matmul, a, b)
+
+    def test_shapes(self):
+        dims = [
+            ((1, 1), (2, 1, 1)),     # broadcast first argument
+            ((2, 1, 1), (1, 1)),     # broadcast second argument
+            ((2, 1, 1), (2, 1, 1)),  # matrix stack sizes match
+            ]
+
+        for dt, (dm1, dm2) in itertools.product(self.types, dims):
+            a = np.ones(dm1, dtype=dt)
+            b = np.ones(dm2, dtype=dt)
+            res = self.matmul(a, b)
+            assert_(res.shape == (2, 1, 1))
+
+        # vector vector returns scalars.
+        for dt in self.types:
+            a = np.ones((2,), dtype=dt)
+            b = np.ones((2,), dtype=dt)
+            c = self.matmul(a, b)
+            assert_(np.array(c).shape == ())
+
+    def test_result_types(self):
+        mat = np.ones((1,1))
+        vec = np.ones((1,))
+        for dt in self.types:
+            m = mat.astype(dt)
+            v = vec.astype(dt)
+            for arg in [(m, v), (v, m), (m, m)]:
+                res = matmul(*arg)
+                assert_(res.dtype == dt)
+
+            # vector vector returns scalars
+            res = matmul(v, v)
+            assert_(type(res) is dtype(dt).type)
+
+    def test_vector_vector_values(self):
+        vec = np.array([1, 2])
+        tgt = 5
+        for dt in self.types[1:]:
+            v1 = vec.astype(dt)
+            res = matmul(v1, v1)
+            assert_equal(res, tgt)
+
+        # boolean type
+        vec = np.array([True, True], dtype='?')
+        res = matmul(vec, vec)
+        assert_equal(res, True)
+
+    def test_vector_matrix_values(self):
+        vec = np.array([1, 2])
+        mat1 = np.array([[1, 2], [3, 4]])
+        mat2 = np.stack([mat1]*2, axis=0)
+        tgt1 = np.array([7, 10])
+        tgt2 = np.stack([tgt1]*2, axis=0)
+        for dt in self.types[1:]:
+            v = vec.astype(dt)
+            m1 = mat1.astype(dt)
+            m2 = mat2.astype(dt)
+            res = matmul(v, m1)
+            assert_equal(res, tgt1)
+            res = matmul(v, m2)
+            assert_equal(res, tgt2)
+
+        # boolean type
+        vec = np.array([True, False])
+        mat1 = np.array([[True, False], [False, True]])
+        mat2 = np.stack([mat1]*2, axis=0)
+        tgt1 = np.array([True, False])
+        tgt2 = np.stack([tgt1]*2, axis=0)
+
+        res = matmul(vec, mat1)
+        assert_equal(res, tgt1)
+        res = matmul(vec, mat2)
+        assert_equal(res, tgt2)
+
+    def test_matrix_vector_values(self):
+        vec = np.array([1, 2])
+        mat1 = np.array([[1, 2], [3, 4]])
+        mat2 = np.stack([mat1]*2, axis=0)
+        tgt1 = np.array([5, 11])
+        tgt2 = np.stack([tgt1]*2, axis=0)
+        for dt in self.types[1:]:
+            v = vec.astype(dt)
+            m1 = mat1.astype(dt)
+            m2 = mat2.astype(dt)
+            res = matmul(m1, v)
+            assert_equal(res, tgt1)
+            res = matmul(m2, v)
+            assert_equal(res, tgt2)
+
+        # boolean type
+        vec = np.array([True, False])
+        mat1 = np.array([[True, False], [False, True]])
+        mat2 = np.stack([mat1]*2, axis=0)
+        tgt1 = np.array([True, False])
+        tgt2 = np.stack([tgt1]*2, axis=0)
+
+        res = matmul(vec, mat1)
+        assert_equal(res, tgt1)
+        res = matmul(vec, mat2)
+        assert_equal(res, tgt2)
+
+    def test_matrix_matrix_values(self):
+        mat1 = np.array([[1, 2], [3, 4]])
+        mat2 = np.array([[1, 0], [1, 1]])
+        mat12 = np.stack([mat1, mat2], axis=0)
+        mat21 = np.stack([mat2, mat1], axis=0)
+        tgt11 = np.array([[7, 10], [15, 22]])
+        tgt12 = np.array([[3, 2], [7, 4]])
+        tgt21 = np.array([[1, 2], [4, 6]])
+        tgt22 = np.array([[1, 0], [2, 1]])
+        tgt12_21 = np.stack([tgt12, tgt21], axis=0)
+        tgt11_12 = np.stack((tgt11, tgt12), axis=0)
+        tgt11_21 = np.stack((tgt11, tgt21), axis=0)
+        for dt in self.types[1:]:
+            m1 = mat1.astype(dt)
+            m2 = mat2.astype(dt)
+            m12 = mat12.astype(dt)
+            m21 = mat21.astype(dt)
+
+            # matrix @ matrix
+            res = matmul(m1, m2)
+            assert_equal(res, tgt12)
+            res = matmul(m2, m1)
+            assert_equal(res, tgt21)
+
+            # stacked @ matrix
+            res = self.matmul(m12, m1)
+            assert_equal(res, tgt11_21)
+
+            # matrix @ stacked
+            res = self.matmul(m1, m12)
+            assert_equal(res, tgt11_12)
+
+            # stacked @ stacked
+            res = self.matmul(m12, m21)
+            assert_equal(res, tgt12_21)
+
+        # boolean type
+        m1 = np.array([[1, 1], [0, 0]], dtype=np.bool_)
+        m2 = np.array([[1, 0], [1, 1]], dtype=np.bool_)
+        m12 = np.stack([m1, m2], axis=0)
+        m21 = np.stack([m2, m1], axis=0)
+        tgt11 = m1
+        tgt12 = m1
+        tgt21 = np.array([[1, 1], [1, 1]], dtype=np.bool_)
+        tgt22 = m2
+        tgt12_21 = np.stack([tgt12, tgt21], axis=0)
+        tgt11_12 = np.stack((tgt11, tgt12), axis=0)
+        tgt11_21 = np.stack((tgt11, tgt21), axis=0)
+
+        # matrix @ matrix
+        res = matmul(m1, m2)
+        assert_equal(res, tgt12)
+        res = matmul(m2, m1)
+        assert_equal(res, tgt21)
+
+        # stacked @ matrix
+        res = self.matmul(m12, m1)
+        assert_equal(res, tgt11_21)
+
+        # matrix @ stacked
+        res = self.matmul(m1, m12)
+        assert_equal(res, tgt11_12)
+
+        # stacked @ stacked
+        res = self.matmul(m12, m21)
+        assert_equal(res, tgt12_21)
+
+    def test_numpy_ufunc_override(self):
+
+        class A(np.ndarray):
+            def __new__(cls, *args, **kwargs):
+                return np.array(*args, **kwargs).view(cls)
+            def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
+                return "A"
+
+        class B(np.ndarray):
+            def __new__(cls, *args, **kwargs):
+                return np.array(*args, **kwargs).view(cls)
+            def __numpy_ufunc__(self, ufunc, method, pos, inputs, **kwargs):
+                return NotImplemented
+
+        a = A([1, 2])
+        b = B([1, 2])
+        c = ones(2)
+        assert_equal(self.matmul(a, b), "A")
+        assert_equal(self.matmul(b, a), "A")
+        assert_raises(TypeError, self.matmul, b, c)
+
+
+class TestMatmul(MatmulCommon, TestCase):
+    matmul = np.matmul
+
+    def test_out_arg(self):
+        a = np.ones((2, 2), dtype=np.float)
+        b = np.ones((2, 2), dtype=np.float)
+        tgt = np.full((2,2), 2, dtype=np.float)
+
+        # test as positional argument
+        msg = "out positional argument"
+        out = np.zeros((2, 2), dtype=np.float)
+        self.matmul(a, b, out)
+        assert_array_equal(out, tgt, err_msg=msg)
+
+        # test as keyword argument
+        msg = "out keyword argument"
+        out = np.zeros((2, 2), dtype=np.float)
+        self.matmul(a, b, out=out)
+        assert_array_equal(out, tgt, err_msg=msg)
+
+        # test out with not allowed type cast (safe casting)
+        # einsum and cblas raise different error types, so
+        # use Exception.
+        msg = "out argument with illegal cast"
+        out = np.zeros((2, 2), dtype=np.int32)
+        assert_raises(Exception, self.matmul, a, b, out=out)
+
+        # skip following tests for now, cblas does not allow non-contiguous
+        # outputs and consistency with dot would require same type,
+        # dimensions, subtype, and c_contiguous.
+
+        # test out with allowed type cast
+        # msg = "out argument with allowed cast"
+        # out = np.zeros((2, 2), dtype=np.complex128)
+        # self.matmul(a, b, out=out)
+        # assert_array_equal(out, tgt, err_msg=msg)
+
+        # test out non-contiguous
+        # msg = "out argument with non-contiguous layout"
+        # c = np.zeros((2, 2, 2), dtype=np.float)
+        # self.matmul(a, b, out=c[..., 0])
+        # assert_array_equal(c, tgt, err_msg=msg)
+
+
+if sys.version_info[:2] >= (3, 5):
+    class TestMatmulOperator(MatmulCommon, TestCase):
+        from operator import matmul
+
+        def test_array_priority_override(self):
+
+            class A(object):
+                __array_priority__ = 1000
+                def __matmul__(self, other):
+                    return "A"
+                def __rmatmul__(self, other):
+                    return "A"
+
+            a = A()
+            b = ones(2)
+            assert_equal(self.matmul(a, b), "A")
+            assert_equal(self.matmul(b, a), "A")
+
+
 class TestInner(TestCase):
 
     def test_inner_scalar_and_matrix_of_objects(self):
@@ -5242,6 +5532,89 @@ class TestHashing(TestCase):
         x = np.array([])
         self.assertFalse(isinstance(x, collections.Hashable))
 
+from numpy.core._internal import _view_is_safe
+
+class TestObjViewSafetyFuncs:
+    def test_view_safety(self):
+        psize = dtype('p').itemsize
+
+        # creates dtype but with extra character code - for missing 'p' fields
+        def mtype(s):
+            n, offset, fields = 0, 0, []
+            for c in s.split(','): #subarrays won't work
+                if c != '-':
+                    fields.append(('f{0}'.format(n), c, offset))
+                    n += 1
+                offset += dtype(c).itemsize if c != '-' else psize
+
+            names, formats, offsets = zip(*fields)
+            return dtype({'names': names, 'formats': formats,
+                          'offsets': offsets, 'itemsize': offset})
+
+        # test nonequal itemsizes with objects:
+        # these should succeed:
+        _view_is_safe(dtype('O,p,O,p'), dtype('O,p,O,p,O,p'))
+        _view_is_safe(dtype('O,O'), dtype('O,O,O'))
+        # these should fail:
+        assert_raises(TypeError, _view_is_safe, dtype('O,O,p'), dtype('O,O'))
+        assert_raises(TypeError, _view_is_safe, dtype('O,O,p'), dtype('O,p'))
+        assert_raises(TypeError, _view_is_safe, dtype('O,O,p'), dtype('p,O'))
+
+        # test nonequal itemsizes with missing fields:
+        # these should succeed:
+        _view_is_safe(mtype('-,p,-,p'), mtype('-,p,-,p,-,p'))
+        _view_is_safe(dtype('p,p'), dtype('p,p,p'))
+        # these should fail:
+        assert_raises(TypeError, _view_is_safe, mtype('p,p,-'), mtype('p,p'))
+        assert_raises(TypeError, _view_is_safe, mtype('p,p,-'), mtype('p,-'))
+        assert_raises(TypeError, _view_is_safe, mtype('p,p,-'), mtype('-,p'))
+
+        # scans through positions at which we can view a type
+        def scanView(d1, otype):
+            goodpos = []
+            for shift in range(d1.itemsize - dtype(otype).itemsize+1):
+                d2 = dtype({'names': ['f0'], 'formats': [otype],
+                            'offsets': [shift], 'itemsize': d1.itemsize})
+                try:
+                    _view_is_safe(d1, d2)
+                except TypeError:
+                    pass
+                else:
+                    goodpos.append(shift)
+            return goodpos
+
+        # test partial overlap with object field
+        assert_equal(scanView(dtype('p,O,p,p,O,O'), 'p'),
+                     [0] + list(range(2*psize, 3*psize+1)))
+        assert_equal(scanView(dtype('p,O,p,p,O,O'), 'O'),
+                     [psize, 4*psize, 5*psize])
+
+        # test partial overlap with missing field
+        assert_equal(scanView(mtype('p,-,p,p,-,-'), 'p'),
+                     [0] + list(range(2*psize, 3*psize+1)))
+
+        # test nested structures with objects:
+        nestedO = dtype([('f0', 'p'), ('f1', 'p,O,p')])
+        assert_equal(scanView(nestedO, 'p'), list(range(psize+1)) + [3*psize])
+        assert_equal(scanView(nestedO, 'O'), [2*psize])
+
+        # test nested structures with missing fields:
+        nestedM = dtype([('f0', 'p'), ('f1', mtype('p,-,p'))])
+        assert_equal(scanView(nestedM, 'p'), list(range(psize+1)) + [3*psize])
+
+        # test subarrays with objects
+        subarrayO = dtype('p,(2,3)O,p')
+        assert_equal(scanView(subarrayO, 'p'), [0, 7*psize])
+        assert_equal(scanView(subarrayO, 'O'), 
+                     list(range(psize, 6*psize+1, psize)))
+
+        #test dtype with overlapping fields
+        overlapped = dtype({'names': ['f0', 'f1', 'f2', 'f3'],
+                            'formats': ['p', 'p', 'p', 'p'],
+                            'offsets': [0, 1, 3*psize-1, 3*psize],
+                            'itemsize': 4*psize})
+        assert_equal(scanView(overlapped, 'p'), [0, 1, 3*psize-1, 3*psize])
+
 
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/core/tests/test_numeric.py b/numpy/core/tests/test_numeric.py
index ee304a7af..7400366ac 100644
--- a/numpy/core/tests/test_numeric.py
+++ b/numpy/core/tests/test_numeric.py
@@ -2226,6 +2226,7 @@ class TestCross(TestCase):
         for axisc in range(-2, 2):
             assert_equal(np.cross(u, u, axisc=axisc).shape, (3, 4))
 
+
 def test_outer_out_param():
     arr1 = np.ones((5,))
     arr2 = np.ones((2,))
@@ -2236,6 +2237,7 @@ def test_outer_out_param():
     assert_equal(res1, out1)
     assert_equal(np.outer(arr2, arr3, out2), out2)
 
+
 class TestRequire(object):
     flag_names = ['C', 'C_CONTIGUOUS', 'CONTIGUOUS',
                   'F', 'F_CONTIGUOUS', 'FORTRAN',
@@ -2310,5 +2312,31 @@ class TestRequire(object):
             yield self.set_and_check_flag, flag, None, a
 
 
+class TestBroadcast(TestCase):
+    def test_broadcast_in_args(self):
+        # gh-5881
+        arrs = [np.empty((6, 7)), np.empty((5, 6, 1)), np.empty((7,)),
+                np.empty((5, 1, 7))]
+        mits = [np.broadcast(*arrs),
+                np.broadcast(np.broadcast(*arrs[:2]), np.broadcast(*arrs[2:])),
+                np.broadcast(arrs[0], np.broadcast(*arrs[1:-1]), arrs[-1])]
+        for mit in mits:
+            assert_equal(mit.shape, (5, 6, 7))
+            assert_equal(mit.nd, 3)
+            assert_equal(mit.numiter, 4)
+            for a, ia in zip(arrs, mit.iters):
+                assert_(a is ia.base)
+
+    def test_number_of_arguments(self):
+        arr = np.empty((5,))
+        for j in range(35):
+            arrs = [arr] * j
+            if j < 2 or j > 32:
+                assert_raises(ValueError, np.broadcast, *arrs)
+            else:
+                mit = np.broadcast(*arrs)
+                assert_equal(mit.numiter, j)
+
+
 if __name__ == "__main__":
     run_module_suite()
diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py
index a7895a30a..b07b3c876 100644
--- a/numpy/core/tests/test_records.py
+++ b/numpy/core/tests/test_records.py
@@ -149,19 +149,32 @@ class TestFromrecords(TestCase):
         assert_equal(a.foo[0] == a.foo[1], False)
 
     def test_recarray_returntypes(self):
-        a = np.rec.array([('abc ', (1,1), 1), ('abc', (2,3), 1)],
+        qux_fields = {'C': (np.dtype('S5'), 0), 'D': (np.dtype('S5'), 6)}
+        a = np.rec.array([('abc ', (1,1), 1, ('abcde', 'fgehi')),
+                          ('abc', (2,3), 1, ('abcde', 'jklmn'))],
                          dtype=[('foo', 'S4'),
                                 ('bar', [('A', int), ('B', int)]),
-                                ('baz', int)])
+                                ('baz', int), ('qux', qux_fields)])
         assert_equal(type(a.foo), np.ndarray)
         assert_equal(type(a['foo']), np.ndarray)
         assert_equal(type(a.bar), np.recarray)
         assert_equal(type(a['bar']), np.recarray)
         assert_equal(a.bar.dtype.type, np.record)
+        assert_equal(type(a['qux']), np.recarray)
+        assert_equal(a.qux.dtype.type, np.record)
+        assert_equal(dict(a.qux.dtype.fields), qux_fields)
         assert_equal(type(a.baz), np.ndarray)
         assert_equal(type(a['baz']), np.ndarray)
         assert_equal(type(a[0].bar), np.record)
+        assert_equal(type(a[0]['bar']), np.record)
         assert_equal(a[0].bar.A, 1)
+        assert_equal(a[0].bar['A'], 1)
+        assert_equal(a[0]['bar'].A, 1)
+        assert_equal(a[0]['bar']['A'], 1)
+        assert_equal(a[0].qux.D, asbytes('fgehi'))
+        assert_equal(a[0].qux['D'], asbytes('fgehi'))
+        assert_equal(a[0]['qux'].D, asbytes('fgehi'))
+        assert_equal(a[0]['qux']['D'], asbytes('fgehi'))
 
 
 class TestRecord(TestCase):
@@ -206,6 +219,15 @@ class TestRecord(TestCase):
         assert_equal(a, pickle.loads(pickle.dumps(a)))
         assert_equal(a[0], pickle.loads(pickle.dumps(a[0])))
 
+    def test_objview_record(self):
+        # https://github.com/numpy/numpy/issues/2599
+        dt = np.dtype([('foo', 'i8'), ('bar', 'O')])
+        r = np.zeros((1,3), dtype=dt).view(np.recarray)
+        r.foo = np.array([1, 2, 3]) # TypeError?
+
+        # https://github.com/numpy/numpy/issues/3256
+        ra = np.recarray((2,), dtype=[('x', object), ('y', float), ('z', int)])
+        ra[['x','y']] #TypeError?
 
 def test_find_duplicate():
     l1 = [1, 2, 3, 4, 5, 6]
diff --git a/numpy/core/tests/test_regression.py b/numpy/core/tests/test_regression.py
index 9e8511a01..399470214 100644
--- a/numpy/core/tests/test_regression.py
+++ b/numpy/core/tests/test_regression.py
@@ -1739,10 +1739,9 @@ class TestRegression(TestCase):
         assert_equal(np.add.accumulate(x[:-1, 0]), [])
 
     def test_objectarray_setfield(self):
-        # Setfield directly manipulates the raw array data,
-        # so is invalid for object arrays.
+        # Setfield should not overwrite Object fields with non-Object data
         x = np.array([1, 2, 3], dtype=object)
-        assert_raises(RuntimeError, x.setfield, 4, np.int32, 0)
+        assert_raises(TypeError, x.setfield, 4, np.int32, 0)
 
     def test_setting_rank0_string(self):
         "Ticket #1736"
diff --git a/numpy/distutils/__init__.py b/numpy/distutils/__init__.py
index c16b10ae5..9297185ef 100644
--- a/numpy/distutils/__init__.py
+++ b/numpy/distutils/__init__.py
@@ -2,36 +2,20 @@ from __future__ import division, absolute_import, print_function
 
 import sys
 
-if sys.version_info[0] < 3:
-    from .__version__ import version as __version__
-    # Must import local ccompiler ASAP in order to get
-    # customized CCompiler.spawn effective.
-    from . import ccompiler
-    from . import unixccompiler
+from .__version__ import version as __version__
+# Must import local ccompiler ASAP in order to get
+# customized CCompiler.spawn effective.
+from . import ccompiler
+from . import unixccompiler
 
-    from .info import __doc__
-    from .npy_pkg_config import *
+from .info import __doc__
+from .npy_pkg_config import *
 
-    try:
-        from . import __config__
-        _INSTALLED = True
-    except ImportError:
-        _INSTALLED = False
-else:
-    from numpy.distutils.__version__ import version as __version__
-    # Must import local ccompiler ASAP in order to get
-    # customized CCompiler.spawn effective.
-    import numpy.distutils.ccompiler
-    import numpy.distutils.unixccompiler
-
-    from numpy.distutils.info import __doc__
-    from numpy.distutils.npy_pkg_config import *
-
-    try:
-        import numpy.distutils.__config__
-        _INSTALLED = True
-    except ImportError:
-        _INSTALLED = False
+try:
+    from . import __config__
+    _INSTALLED = True
+except ImportError:
+    _INSTALLED = False
 
 if _INSTALLED:
     from numpy.testing import Tester
diff --git a/numpy/doc/indexing.py b/numpy/doc/indexing.py
index d3f442c21..0891e7c8d 100644
--- a/numpy/doc/indexing.py
+++ b/numpy/doc/indexing.py
@@ -50,7 +50,7 @@ than dimensions, one gets a subdimensional array. For example: ::
 
 That is, each index specified selects the array corresponding to the
 rest of the dimensions selected. In the above example, choosing 0
-means that remaining dimension of lenth 5 is being left unspecified,
+means that the remaining dimension of length 5 is being left unspecified,
 and that what is returned is an array of that dimensionality and size.
 It must be noted that the returned array is not a copy of the original,
 but points to the same values in memory as does the original array.
@@ -62,7 +62,7 @@ element being returned. That is: ::
     2
 
 So note that ``x[0,2] = x[0][2]`` though the second case is more
-inefficient a new temporary array is created after the first index
+inefficient as a new temporary array is created after the first index
 that is subsequently indexed by 2.
 
 Note to those used to IDL or Fortran memory order as it relates to
diff --git a/numpy/fft/fftpack.py b/numpy/fft/fftpack.py
index 4efb2a9a0..e12ae1eec 100644
--- a/numpy/fft/fftpack.py
+++ b/numpy/fft/fftpack.py
@@ -35,13 +35,13 @@ from __future__ import division, absolute_import, print_function
 __all__ = ['fft', 'ifft', 'rfft', 'irfft', 'hfft', 'ihfft', 'rfftn',
            'irfftn', 'rfft2', 'irfft2', 'fft2', 'ifft2', 'fftn', 'ifftn']
 
-from numpy.core import asarray, zeros, swapaxes, shape, conjugate, \
-     take
+from numpy.core import array, asarray, zeros, swapaxes, shape, conjugate, take
 from . import fftpack_lite as fftpack
 
 _fft_cache = {}
 _real_fft_cache = {}
 
+
 def _raw_fft(a, n=None, axis=-1, init_function=fftpack.cffti,
              work_function=fftpack.cfftf, fft_cache=_fft_cache):
     a = asarray(a)
@@ -248,8 +248,8 @@ def ifft(a, n=None, axis=-1):
     >>> plt.show()
 
     """
-
-    a = asarray(a).astype(complex)
+    # The copy may be required for multithreading.
+    a = array(a, copy=True, dtype=complex)
     if n is None:
         n = shape(a)[axis]
     return _raw_fft(a, n, axis, fftpack.cffti, fftpack.cfftb, _fft_cache) / n
@@ -329,8 +329,8 @@ def rfft(a, n=None, axis=-1):
     exploited to compute only the non-negative frequency terms.
 
     """
-
-    a = asarray(a).astype(float)
+    # The copy may be required for multithreading.
+    a = array(a, copy=True, dtype=float)
     return _raw_fft(a, n, axis, fftpack.rffti, fftpack.rfftf, _real_fft_cache)
 
 
@@ -410,8 +410,8 @@ def irfft(a, n=None, axis=-1):
     specified, and the output array is purely real.
 
     """
-
-    a = asarray(a).astype(complex)
+    # The copy may be required for multithreading.
+    a = array(a, copy=True, dtype=complex)
     if n is None:
         n = (shape(a)[axis] - 1) * 2
     return _raw_fft(a, n, axis, fftpack.rffti, fftpack.rfftb,
@@ -484,8 +484,8 @@ def hfft(a, n=None, axis=-1):
            [ 2., -2.]])
 
     """
-
-    a = asarray(a).astype(complex)
+    # The copy may be required for multithreading.
+    a = array(a, copy=True, dtype=complex)
     if n is None:
         n = (shape(a)[axis] - 1) * 2
     return irfft(conjugate(a), n, axis) * n
@@ -538,8 +538,8 @@ def ihfft(a, n=None, axis=-1):
     array([ 1.-0.j,  2.-0.j,  3.-0.j,  4.-0.j])
 
     """
-
-    a = asarray(a).astype(float)
+    # The copy may be required for multithreading.
+    a = array(a, copy=True, dtype=float)
     if n is None:
         n = shape(a)[axis]
     return conjugate(rfft(a, n, axis))/n
@@ -1007,8 +1007,8 @@ def rfftn(a, s=None, axes=None):
             [ 0.+0.j,  0.+0.j]]])
 
     """
-
-    a = asarray(a).astype(float)
+    # The copy may be required for multithreading.
+    a = array(a, copy=True, dtype=float)
     s, axes = _cook_nd_args(a, s, axes)
     a = rfft(a, s[-1], axes[-1])
     for ii in range(len(axes)-1):
@@ -1128,8 +1128,8 @@ def irfftn(a, s=None, axes=None):
             [ 1.,  1.]]])
 
     """
-
-    a = asarray(a).astype(complex)
+    # The copy may be required for multithreading.
+    a = array(a, copy=True, dtype=complex)
     s, axes = _cook_nd_args(a, s, axes, invreal=1)
     for ii in range(len(axes)-1):
         a = ifft(a, s[ii], axes[ii])
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 04063755c..94d63c027 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -2320,7 +2320,7 @@ def hanning(M):
     .. math::  w(n) = 0.5 - 0.5cos\\left(\\frac{2\\pi{n}}{M-1}\\right)
                \\qquad 0 \\leq n \\leq M-1
 
-    The Hanning was named for Julius van Hann, an Austrian meteorologist.
+    The Hanning was named for Julius von Hann, an Austrian meteorologist.
     It is also known as the Cosine Bell. Some authors prefer that it be
     called a Hann window, to help avoid confusion with the very similar
     Hamming window.
diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py
index 51a2077eb..13e75cbd0 100644
--- a/numpy/lib/tests/test_recfunctions.py
+++ b/numpy/lib/tests/test_recfunctions.py
@@ -700,6 +700,36 @@ class TestJoinBy2(TestCase):
         assert_equal(test.dtype, control.dtype)
         assert_equal(test, control)
 
+class TestAppendFieldsObj(TestCase):
+    """
+    Test append_fields with arrays containing objects
+    """
+    # https://github.com/numpy/numpy/issues/2346
+
+    def setUp(self):
+        from datetime import date
+        self.data = dict(obj=date(2000, 1, 1))
+
+    def test_append_to_objects(self):
+        "Test append_fields when the base array contains objects"
+        obj = self.data['obj']
+        x = np.array([(obj, 1.), (obj, 2.)],
+                      dtype=[('A', object), ('B', float)])
+        y = np.array([10, 20], dtype=int)
+        test = append_fields(x, 'C', data=y, usemask=False)
+        control = np.array([(obj, 1.0, 10), (obj, 2.0, 20)],
+                           dtype=[('A', object), ('B', float), ('C', int)])
+        assert_equal(test, control)
+
+    def test_append_with_objects(self):
+        "Test append_fields when the appended data contains objects"
+        obj = self.data['obj']
+        x = np.array([(10, 1.), (20, 2.)], dtype=[('A', int), ('B', float)])
+        y = np.array([obj, obj], dtype=object)
+        test = append_fields(x, 'C', data=y, dtypes=object, usemask=False)
+        control = np.array([(10, 1.0, obj), (20, 2.0, obj)],
+                           dtype=[('A', int), ('B', float), ('C', object)])
+        assert_equal(test, control)
 
 if __name__ == '__main__':
     run_module_suite()
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 30180f24a..d2e786970 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -23,7 +23,7 @@ from numpy.core import (
     csingle, cdouble, inexact, complexfloating, newaxis, ravel, all, Inf, dot,
     add, multiply, sqrt, maximum, fastCopyAndTranspose, sum, isfinite, size,
     finfo, errstate, geterrobj, longdouble, rollaxis, amin, amax, product, abs,
-    broadcast, atleast_2d, intp, asanyarray
+    broadcast, atleast_2d, intp, asanyarray, isscalar
     )
 from numpy.lib import triu, asfarray
 from numpy.linalg import lapack_lite, _umath_linalg
@@ -382,7 +382,7 @@ def solve(a, b):
     extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
     r = gufunc(a, b, signature=signature, extobj=extobj)
 
-    return wrap(r.astype(result_t))
+    return wrap(r.astype(result_t, copy=False))
 
 
 def tensorinv(a, ind=2):
@@ -522,7 +522,7 @@ def inv(a):
     signature = 'D->D' if isComplexType(t) else 'd->d'
     extobj = get_linalg_error_extobj(_raise_linalgerror_singular)
     ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
-    return wrap(ainv.astype(result_t))
+    return wrap(ainv.astype(result_t, copy=False))
 
 
 # Cholesky decomposition
@@ -606,7 +606,8 @@ def cholesky(a):
     _assertNdSquareness(a)
     t, result_t = _commonType(a)
     signature = 'D->D' if isComplexType(t) else 'd->d'
-    return wrap(gufunc(a, signature=signature, extobj=extobj).astype(result_t))
+    r = gufunc(a, signature=signature, extobj=extobj)
+    return wrap(r.astype(result_t, copy=False))
 
 # QR decompostion
 
@@ -781,7 +782,7 @@ def qr(a, mode='reduced'):
 
     if mode == 'economic':
         if t != result_t :
-            a = a.astype(result_t)
+            a = a.astype(result_t, copy=False)
         return wrap(a.T)
 
     #  generate q from a
@@ -908,7 +909,7 @@ def eigvals(a):
         else:
             result_t = _complexType(result_t)
 
-    return w.astype(result_t)
+    return w.astype(result_t, copy=False)
 
 def eigvalsh(a, UPLO='L'):
     """
@@ -978,7 +979,7 @@ def eigvalsh(a, UPLO='L'):
     t, result_t = _commonType(a)
     signature = 'D->d' if isComplexType(t) else 'd->d'
     w = gufunc(a, signature=signature, extobj=extobj)
-    return w.astype(_realType(result_t))
+    return w.astype(_realType(result_t), copy=False)
 
 def _convertarray(a):
     t, result_t = _commonType(a)
@@ -1124,8 +1125,8 @@ def eig(a):
     else:
         result_t = _complexType(result_t)
 
-    vt = vt.astype(result_t)
-    return w.astype(result_t), wrap(vt)
+    vt = vt.astype(result_t, copy=False)
+    return w.astype(result_t, copy=False), wrap(vt)
 
 
 def eigh(a, UPLO='L'):
@@ -1232,8 +1233,8 @@ def eigh(a, UPLO='L'):
 
     signature = 'D->dD' if isComplexType(t) else 'd->dd'
     w, vt = gufunc(a, signature=signature, extobj=extobj)
-    w = w.astype(_realType(result_t))
-    vt = vt.astype(result_t)
+    w = w.astype(_realType(result_t), copy=False)
+    vt = vt.astype(result_t, copy=False)
     return w, wrap(vt)
 
 
@@ -1344,9 +1345,9 @@ def svd(a, full_matrices=1, compute_uv=1):
 
         signature = 'D->DdD' if isComplexType(t) else 'd->ddd'
         u, s, vt = gufunc(a, signature=signature, extobj=extobj)
-        u = u.astype(result_t)
-        s = s.astype(_realType(result_t))
-        vt = vt.astype(result_t)
+        u = u.astype(result_t, copy=False)
+        s = s.astype(_realType(result_t), copy=False)
+        vt = vt.astype(result_t, copy=False)
         return wrap(u), s, wrap(vt)
     else:
         if m < n:
@@ -1356,7 +1357,7 @@ def svd(a, full_matrices=1, compute_uv=1):
 
         signature = 'D->d' if isComplexType(t) else 'd->d'
         s = gufunc(a, signature=signature, extobj=extobj)
-        s = s.astype(_realType(result_t))
+        s = s.astype(_realType(result_t), copy=False)
         return s
 
 def cond(x, p=None):
@@ -1695,7 +1696,15 @@ def slogdet(a):
     real_t = _realType(result_t)
     signature = 'D->Dd' if isComplexType(t) else 'd->dd'
     sign, logdet = _umath_linalg.slogdet(a, signature=signature)
-    return sign.astype(result_t), logdet.astype(real_t)
+    if isscalar(sign):
+        sign = sign.astype(result_t)
+    else:
+        sign = sign.astype(result_t, copy=False)
+    if isscalar(logdet):
+        logdet = logdet.astype(real_t)
+    else:
+        logdet = logdet.astype(real_t, copy=False)
+    return sign, logdet
 
 def det(a):
     """
@@ -1749,7 +1758,12 @@ def det(a):
     _assertNdSquareness(a)
     t, result_t = _commonType(a)
     signature = 'D->D' if isComplexType(t) else 'd->d'
-    return _umath_linalg.det(a, signature=signature).astype(result_t)
+    r = _umath_linalg.det(a, signature=signature)
+    if isscalar(r):
+        r = r.astype(result_t)
+    else:
+        r = r.astype(result_t, copy=False)
+    return r
 
 # Linear Least Squares
 
@@ -1905,12 +1919,12 @@ def lstsq(a, b, rcond=-1):
         if results['rank'] == n and m > n:
             if isComplexType(t):
                 resids = sum(abs(transpose(bstar)[n:,:])**2, axis=0).astype(
-                    result_real_t)
+                    result_real_t, copy=False)
             else:
                 resids = sum((transpose(bstar)[n:,:])**2, axis=0).astype(
-                    result_real_t)
+                    result_real_t, copy=False)
 
-    st = s[:min(n, m)].copy().astype(result_real_t)
+    st = s[:min(n, m)].astype(result_real_t, copy=True)
     return wrap(x), wrap(resids), results['rank'], st
 
 
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index a09d2c10a..60cada325 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -1128,6 +1128,7 @@ static void
     npy_uint8 *tmp_buff = NULL;
     size_t matrix_size;
     size_t pivot_size;
+    size_t safe_m;
     /* notes:
      *   matrix will need to be copied always, as factorization in lapack is
      *          made inplace
@@ -1138,8 +1139,9 @@ static void
      */
     INIT_OUTER_LOOP_3
     m = (fortran_int) dimensions[0];
-    matrix_size = m*m*sizeof(@typ@);
-    pivot_size = m*sizeof(fortran_int);
+    safe_m = m;
+    matrix_size = safe_m * safe_m * sizeof(@typ@);
+    pivot_size = safe_m * sizeof(fortran_int);
     tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
 
     if (tmp_buff)
@@ -1172,6 +1174,7 @@ static void
     npy_uint8 *tmp_buff;
     size_t matrix_size;
     size_t pivot_size;
+    size_t safe_m;
     /* notes:
      *   matrix will need to be copied always, as factorization in lapack is
      *       made inplace
@@ -1182,8 +1185,9 @@ static void
      */
     INIT_OUTER_LOOP_2
     m = (fortran_int) dimensions[0];
-    matrix_size = m*m*sizeof(@typ@);
-    pivot_size = m*sizeof(fortran_int);
+    safe_m = m;
+    matrix_size = safe_m * safe_m * sizeof(@typ@);
+    pivot_size = safe_m * sizeof(fortran_int);
     tmp_buff = (npy_uint8 *)malloc(matrix_size + pivot_size);
 
     if (tmp_buff)
@@ -1252,14 +1256,15 @@ init_@lapack_func@(EIGH_PARAMS_t* params, char JOBZ, char UPLO,
     fortran_int liwork = -1;
     fortran_int info;
     npy_uint8 *a, *w, *work, *iwork;
-    size_t alloc_size = N*(N+1)*sizeof(@typ@);
+    size_t safe_N = N;
+    size_t alloc_size = safe_N * (safe_N + 1) * sizeof(@typ@);
 
     mem_buff = malloc(alloc_size);
 
     if (!mem_buff)
         goto error;
     a = mem_buff;
-    w = mem_buff + N*N*sizeof(@typ@);
+    w = mem_buff + safe_N * safe_N * sizeof(@typ@);
     LAPACK(@lapack_func@)(&JOBZ, &UPLO, &N,
                           (@ftyp@*)a, &N, (@ftyp@*)w,
                           &query_work_size, &lwork,
@@ -1344,12 +1349,14 @@ init_@lapack_func@(EIGH_PARAMS_t *params,
     fortran_int liwork = -1;
     npy_uint8 *a, *w, *work, *rwork, *iwork;
     fortran_int info;
+    size_t safe_N = N;
 
-    mem_buff = malloc(N*N*sizeof(@typ@)+N*sizeof(@basetyp@));
+    mem_buff = malloc(safe_N * safe_N * sizeof(@typ@) +
+    	              safe_N * sizeof(@basetyp@));
     if (!mem_buff)
         goto error;
     a = mem_buff;
-    w = mem_buff+N*N*sizeof(@typ@);
+    w = mem_buff + safe_N * safe_N * sizeof(@typ@);
 
     LAPACK(@lapack_func@)(&JOBZ, &UPLO, &N,
                           (@ftyp@*)a, &N, (@fbasetyp@*)w,
@@ -1581,14 +1588,16 @@ init_@lapack_func@(GESV_PARAMS_t *params, fortran_int N, fortran_int NRHS)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *a, *b, *ipiv;
-    mem_buff = malloc(N*N*sizeof(@ftyp@) +
-                      N*NRHS*sizeof(@ftyp@) +
-                      N*sizeof(fortran_int));
+    size_t safe_N = N;
+    size_t safe_NRHS = NRHS;
+    mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@) +
+                      safe_N * safe_NRHS*sizeof(@ftyp@) +
+                      safe_N * sizeof(fortran_int));
     if (!mem_buff)
         goto error;
     a = mem_buff;
-    b = a + N*N*sizeof(@ftyp@);
-    ipiv = b + N*NRHS*sizeof(@ftyp@);
+    b = a + safe_N * safe_N * sizeof(@ftyp@);
+    ipiv = b + safe_N * safe_NRHS * sizeof(@ftyp@);
 
     params->A = a;
     params->B = b;
@@ -1759,8 +1768,9 @@ init_@lapack_func@(POTR_PARAMS_t *params, char UPLO, fortran_int N)
 {
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *a;
+    size_t safe_N = N;
 
-    mem_buff = malloc(N*N*sizeof(@ftyp@));
+    mem_buff = malloc(safe_N * safe_N * sizeof(@ftyp@));
     if (!mem_buff)
         goto error;
 
@@ -1924,11 +1934,12 @@ init_@lapack_func@(GEEV_PARAMS_t *params, char jobvl, char jobvr, fortran_int n)
     npy_uint8 *mem_buff=NULL;
     npy_uint8 *mem_buff2=NULL;
     npy_uint8 *a, *wr, *wi, *vlr, *vrr, *work, *w, *vl, *vr;
-    size_t a_size = n*n*sizeof(@typ@);
-    size_t wr_size = n*sizeof(@typ@);
-    size_t wi_size = n*sizeof(@typ@);
-    size_t vlr_size = jobvl=='V' ? n*n*sizeof(@typ@) : 0;
-    size_t vrr_size = jobvr=='V' ? n*n*sizeof(@typ@) : 0;
+    size_t safe_n = n;
+    size_t a_size = safe_n * safe_n * sizeof(@typ@);
+    size_t wr_size = safe_n * sizeof(@typ@);
+    size_t wi_size = safe_n * sizeof(@typ@);
+    size_t vlr_size = jobvl=='V' ? safe_n * safe_n * sizeof(@typ@) : 0;
+    size_t vrr_size = jobvr=='V' ? safe_n * safe_n * sizeof(@typ@) : 0;
     size_t w_size = wr_size*2;
     size_t vl_size = vlr_size*2;
     size_t vr_size = vrr_size*2;
@@ -2120,11 +2131,12 @@ init_@lapack_func@(GEEV_PARAMS_t* params,
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *w, *vl, *vr, *work, *rwork;
-    size_t a_size = n*n*sizeof(@ftyp@);
-    size_t w_size = n*sizeof(@ftyp@);
-    size_t vl_size = jobvl=='V'? n*n*sizeof(@ftyp@) : 0;
-    size_t vr_size = jobvr=='V'? n*n*sizeof(@ftyp@) : 0;
-    size_t rwork_size = 2*n*sizeof(@realtyp@);
+    size_t safe_n = n;
+    size_t a_size = safe_n * safe_n * sizeof(@ftyp@);
+    size_t w_size = safe_n * sizeof(@ftyp@);
+    size_t vl_size = jobvl=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0;
+    size_t vr_size = jobvr=='V'? safe_n * safe_n * sizeof(@ftyp@) : 0;
+    size_t rwork_size = 2 * safe_n * sizeof(@realtyp@);
     size_t work_count = 0;
     @typ@ work_size_query;
     fortran_int do_size_query = -1;
@@ -2446,20 +2458,27 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     npy_uint8 *mem_buff = NULL;
     npy_uint8 *mem_buff2 = NULL;
     npy_uint8 *a, *s, *u, *vt, *work, *iwork;
-    size_t a_size = (size_t)m*(size_t)n*sizeof(@ftyp@);
+    size_t safe_m = m;
+    size_t safe_n = n;
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
     fortran_int min_m_n = m<n?m:n;
-    size_t s_size = ((size_t)min_m_n)*sizeof(@ftyp@);
-    fortran_int  u_row_count, vt_column_count;
+    size_t safe_min_m_n = min_m_n;
+    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+    fortran_int u_row_count, vt_column_count;
+    size_t safe_u_row_count, safe_vt_column_count;
     size_t u_size, vt_size;
     fortran_int work_count;
     size_t work_size;
-    size_t iwork_size = 8*((size_t)min_m_n)*sizeof(fortran_int);
+    size_t iwork_size = 8 * safe_min_m_n * sizeof(fortran_int);
 
     if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count))
         goto error;
 
-    u_size = ((size_t)u_row_count)*m*sizeof(@ftyp@);
-    vt_size = n*((size_t)vt_column_count)*sizeof(@ftyp@);
+    safe_u_row_count = u_row_count;
+    safe_vt_column_count = vt_column_count;
+
+    u_size = safe_u_row_count * safe_m * sizeof(@ftyp@);
+    vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@);
 
     mem_buff = malloc(a_size + s_size + u_size + vt_size + iwork_size);
 
@@ -2557,21 +2576,28 @@ init_@lapack_func@(GESDD_PARAMS_t *params,
     npy_uint8 *mem_buff = NULL, *mem_buff2 = NULL;
     npy_uint8 *a,*s, *u, *vt, *work, *rwork, *iwork;
     size_t a_size, s_size, u_size, vt_size, work_size, rwork_size, iwork_size;
+    size_t safe_u_row_count, safe_vt_column_count;
     fortran_int u_row_count, vt_column_count, work_count;
+    size_t safe_m = m;
+    size_t safe_n = n;
     fortran_int min_m_n = m<n?m:n;
+    size_t safe_min_m_n = min_m_n;
 
     if (!compute_urows_vtcolumns(jobz, m, n, &u_row_count, &vt_column_count))
         goto error;
 
-    a_size = ((size_t)m)*((size_t)n)*sizeof(@ftyp@);
-    s_size = ((size_t)min_m_n)*sizeof(@frealtyp@);
-    u_size = ((size_t)u_row_count)*m*sizeof(@ftyp@);
-    vt_size = n*((size_t)vt_column_count)*sizeof(@ftyp@);
+    safe_u_row_count = u_row_count;
+    safe_vt_column_count = vt_column_count;
+
+    a_size = safe_m * safe_n * sizeof(@ftyp@);
+    s_size = safe_min_m_n * sizeof(@frealtyp@);
+    u_size = safe_u_row_count * safe_m * sizeof(@ftyp@);
+    vt_size = safe_n * safe_vt_column_count * sizeof(@ftyp@);
     rwork_size = 'N'==jobz?
-        7*((size_t)min_m_n) :
-        (5*(size_t)min_m_n*(size_t)min_m_n + 5*(size_t)min_m_n);
+        (7 * safe_min_m_n) :
+        (5*safe_min_m_n * safe_min_m_n + 5*safe_min_m_n);
     rwork_size *= sizeof(@ftyp@);
-    iwork_size = 8*((size_t)min_m_n)*sizeof(fortran_int);
+    iwork_size = 8 * safe_min_m_n* sizeof(fortran_int);
 
     mem_buff = malloc(a_size +
                       s_size +
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index 45fb8c98b..5df928a6d 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -3044,8 +3044,7 @@ class MaskedArray(ndarray):
 #        if getmask(indx) is not nomask:
 #            msg = "Masked arrays must be filled before they can be used as indices!"
 #            raise IndexError(msg)
-        _data = ndarray.view(self, ndarray)
-        dout = ndarray.__getitem__(_data, indx)
+        dout = self.data[indx]
         # We could directly use ndarray.__getitem__ on self...
         # But then we would have to modify __array_finalize__ to prevent the
         # mask of being reshaped if it hasn't been set up properly yet...
@@ -3075,6 +3074,8 @@ class MaskedArray(ndarray):
             # Update the mask if needed
             if _mask is not nomask:
                 dout._mask = _mask[indx]
+                # set shape to match that of data; this is needed for matrices
+                dout._mask.shape = dout.shape
                 dout._sharedmask = True
 #               Note: Don't try to check for m.any(), that'll take too long...
         return dout
@@ -3092,16 +3093,16 @@ class MaskedArray(ndarray):
 #        if getmask(indx) is not nomask:
 #            msg = "Masked arrays must be filled before they can be used as indices!"
 #            raise IndexError(msg)
-        _data = ndarray.view(self, ndarray.__getattribute__(self, '_baseclass'))
-        _mask = ndarray.__getattribute__(self, '_mask')
+        _data = self._data
+        _mask = self._mask
         if isinstance(indx, basestring):
-            ndarray.__setitem__(_data, indx, value)
+            _data[indx] = value
             if _mask is nomask:
                 self._mask = _mask = make_mask_none(self.shape, self.dtype)
             _mask[indx] = getmask(value)
             return
         #........................................
-        _dtype = ndarray.__getattribute__(_data, 'dtype')
+        _dtype = _data.dtype
         nbfields = len(_dtype.names or ())
         #........................................
         if value is masked:
@@ -3125,21 +3126,21 @@ class MaskedArray(ndarray):
             mval = tuple([False] * nbfields)
         if _mask is nomask:
             # Set the data, then the mask
-            ndarray.__setitem__(_data, indx, dval)
+            _data[indx] = dval
             if mval is not nomask:
                 _mask = self._mask = make_mask_none(self.shape, _dtype)
-                ndarray.__setitem__(_mask, indx, mval)
+                _mask[indx] = mval
         elif not self._hardmask:
             # Unshare the mask if necessary to avoid propagation
             if not self._isfield:
                 self.unshare_mask()
-                _mask = ndarray.__getattribute__(self, '_mask')
+                _mask = self._mask
             # Set the data, then the mask
-            ndarray.__setitem__(_data, indx, dval)
-            ndarray.__setitem__(_mask, indx, mval)
+            _data[indx] = dval
+            _mask[indx] = mval
         elif hasattr(indx, 'dtype') and (indx.dtype == MaskType):
             indx = indx * umath.logical_not(_mask)
-            ndarray.__setitem__(_data, indx, dval)
+            _data[indx] = dval
         else:
             if nbfields:
                 err_msg = "Flexible 'hard' masks are not yet supported..."
@@ -3150,7 +3151,7 @@ class MaskedArray(ndarray):
                 np.copyto(dindx, dval, where=~mindx)
             elif mindx is nomask:
                 dindx = dval
-            ndarray.__setitem__(_data, indx, dindx)
+            _data[indx] = dindx
             _mask[indx] = mindx
         return
 
diff --git a/numpy/ma/tests/test_core.py b/numpy/ma/tests/test_core.py
index f8a28164e..6df235e9f 100644
--- a/numpy/ma/tests/test_core.py
+++ b/numpy/ma/tests/test_core.py
@@ -275,6 +275,49 @@ class TestMaskedArray(TestCase):
         assert_equal(s1, s2)
         assert_(x1[1:1].shape == (0,))
 
+    def test_matrix_indexing(self):
+        # Tests conversions and indexing
+        x1 = np.matrix([[1, 2, 3], [4, 3, 2]])
+        x2 = array(x1, mask=[[1, 0, 0], [0, 1, 0]])
+        x3 = array(x1, mask=[[0, 1, 0], [1, 0, 0]])
+        x4 = array(x1)
+        # test conversion to strings
+        junk, garbage = str(x2), repr(x2)
+        # assert_equal(np.sort(x1), sort(x2, endwith=False))
+        # tests of indexing
+        assert_(type(x2[1, 0]) is type(x1[1, 0]))
+        assert_(x1[1, 0] == x2[1, 0])
+        assert_(x2[1, 1] is masked)
+        assert_equal(x1[0, 2], x2[0, 2])
+        assert_equal(x1[0, 1:], x2[0, 1:])
+        assert_equal(x1[:, 2], x2[:, 2])
+        assert_equal(x1[:], x2[:])
+        assert_equal(x1[1:], x3[1:])
+        x1[0, 2] = 9
+        x2[0, 2] = 9
+        assert_equal(x1, x2)
+        x1[0, 1:] = 99
+        x2[0, 1:] = 99
+        assert_equal(x1, x2)
+        x2[0, 1] = masked
+        assert_equal(x1, x2)
+        x2[0, 1:] = masked
+        assert_equal(x1, x2)
+        x2[0, :] = x1[0, :]
+        x2[0, 1] = masked
+        assert_(allequal(getmask(x2), np.array([[0, 1, 0], [0, 1, 0]])))
+        x3[1, :] = masked_array([1, 2, 3], [1, 1, 0])
+        assert_(allequal(getmask(x3)[1], array([1, 1, 0])))
+        assert_(allequal(getmask(x3[1]), array([1, 1, 0])))
+        x4[1, :] = masked_array([1, 2, 3], [1, 1, 0])
+        assert_(allequal(getmask(x4[1]), array([1, 1, 0])))
+        assert_(allequal(x4[1], array([1, 2, 3])))
+        x1 = np.matrix(np.arange(5) * 1.0)
+        x2 = masked_values(x1, 3.0)
+        assert_equal(x1, x2)
+        assert_(allequal(array([0, 0, 0, 1, 0], MaskType), x2.mask))
+        assert_equal(3.0, x2.fill_value)
+
     def test_copy(self):
         # Tests of some subtle points of copying and sizing.
         n = [0, 0, 1, 0, 0]
diff --git a/numpy/ma/tests/test_subclassing.py b/numpy/ma/tests/test_subclassing.py
index ade5c59da..07fc8fdd6 100644
--- a/numpy/ma/tests/test_subclassing.py
+++ b/numpy/ma/tests/test_subclassing.py
@@ -84,20 +84,71 @@ mmatrix = MMatrix
 
 # also a subclass that overrides __str__, __repr__ and __setitem__, disallowing
 # setting to non-class values (and thus np.ma.core.masked_print_option)
+class CSAIterator(object):
+    """
+    Flat iterator object that uses its own setter/getter
+    (works around ndarray.flat not propagating subclass setters/getters
+    see https://github.com/numpy/numpy/issues/4564)
+    roughly following MaskedIterator
+    """
+    def __init__(self, a):
+        self._original = a
+        self._dataiter = a.view(np.ndarray).flat
+
+    def __iter__(self):
+        return self
+
+    def __getitem__(self, indx):
+        out = self._dataiter.__getitem__(indx)
+        if not isinstance(out, np.ndarray):
+            out = out.__array__()
+        out = out.view(type(self._original))
+        return out
+
+    def __setitem__(self, index, value):
+        self._dataiter[index] = self._original._validate_input(value)
+
+    def __next__(self):
+        return next(self._dataiter).__array__().view(type(self._original))
+
+    next = __next__
+
+
 class ComplicatedSubArray(SubArray):
+
     def __str__(self):
-        return 'myprefix {0} mypostfix'.format(
-            super(ComplicatedSubArray, self).__str__())
+        return 'myprefix {0} mypostfix'.format(self.view(SubArray))
 
     def __repr__(self):
         # Return a repr that does not start with 'name('
         return '<{0} {1}>'.format(self.__class__.__name__, self)
 
-    def __setitem__(self, item, value):
-        # this ensures direct assignment to masked_print_option will fail
+    def _validate_input(self, value):
         if not isinstance(value, ComplicatedSubArray):
             raise ValueError("Can only set to MySubArray values")
-        super(ComplicatedSubArray, self).__setitem__(item, value)
+        return value
+
+    def __setitem__(self, item, value):
+        # validation ensures direct assignment with ndarray or
+        # masked_print_option will fail
+        super(ComplicatedSubArray, self).__setitem__(
+            item, self._validate_input(value))
+
+    def __getitem__(self, item):
+        # ensure getter returns our own class also for scalars
+        value = super(ComplicatedSubArray, self).__getitem__(item)
+        if not isinstance(value, np.ndarray):  # scalar
+            value = value.__array__().view(ComplicatedSubArray)
+        return value
+
+    @property
+    def flat(self):
+        return CSAIterator(self)
+
+    @flat.setter
+    def flat(self, value):
+        y = self.ravel()
+        y[:] = value
 
 
 class TestSubclassing(TestCase):
@@ -205,6 +256,38 @@ class TestSubclassing(TestCase):
         assert_equal(mxsub.info, xsub.info)
         assert_equal(mxsub._mask, m)
 
+    def test_subclass_items(self):
+        """test that getter and setter go via baseclass"""
+        x = np.arange(5)
+        xcsub = ComplicatedSubArray(x)
+        mxcsub = masked_array(xcsub, mask=[True, False, True, False, False])
+        # getter should  return a ComplicatedSubArray, even for single item
+        # first check we wrote ComplicatedSubArray correctly
+        self.assertTrue(isinstance(xcsub[1], ComplicatedSubArray))
+        self.assertTrue(isinstance(xcsub[1:4], ComplicatedSubArray))
+        # now that it propagates inside the MaskedArray
+        self.assertTrue(isinstance(mxcsub[1], ComplicatedSubArray))
+        self.assertTrue(mxcsub[0] is masked)
+        self.assertTrue(isinstance(mxcsub[1:4].data, ComplicatedSubArray))
+        # also for flattened version (which goes via MaskedIterator)
+        self.assertTrue(isinstance(mxcsub.flat[1].data, ComplicatedSubArray))
+        self.assertTrue(mxcsub[0] is masked)
+        self.assertTrue(isinstance(mxcsub.flat[1:4].base, ComplicatedSubArray))
+
+        # setter should only work with ComplicatedSubArray input
+        # first check we wrote ComplicatedSubArray correctly
+        assert_raises(ValueError, xcsub.__setitem__, 1, x[4])
+        # now that it propagates inside the MaskedArray
+        assert_raises(ValueError, mxcsub.__setitem__, 1, x[4])
+        assert_raises(ValueError, mxcsub.__setitem__, slice(1, 4), x[1:4])
+        mxcsub[1] = xcsub[4]
+        mxcsub[1:4] = xcsub[1:4]
+        # also for flattened version (which goes via MaskedIterator)
+        assert_raises(ValueError, mxcsub.flat.__setitem__, 1, x[4])
+        assert_raises(ValueError, mxcsub.flat.__setitem__, slice(1, 4), x[1:4])
+        mxcsub.flat[1] = xcsub[4]
+        mxcsub.flat[1:4] = xcsub[1:4]
+
     def test_subclass_repr(self):
         """test that repr uses the name of the subclass
         and 'array' for np.ndarray"""