Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/_add_newdocs.py                        123
-rw-r--r--  numpy/core/code_generators/generate_umath.py       21
-rw-r--r--  numpy/core/code_generators/ufunc_docstrings.py    125
-rw-r--r--  numpy/core/multiarray.py                            1
-rw-r--r--  numpy/core/setup.py                                 3
-rw-r--r--  numpy/core/src/common/cblasfuncs.c                  2
-rw-r--r--  numpy/core/src/common/cblasfuncs.h                  2
-rw-r--r--  numpy/core/src/multiarray/multiarraymodule.c      155
-rw-r--r--  numpy/core/src/multiarray/number.c                 10
-rw-r--r--  numpy/core/src/multiarray/number.h                  1
-rw-r--r--  numpy/core/src/multiarray/scalartypes.c.src        16
-rw-r--r--  numpy/core/src/umath/matmul.c.src                 403
-rw-r--r--  numpy/core/src/umath/matmul.h.src                  12
-rw-r--r--  numpy/core/src/umath/scalarmath.c.src              17
-rw-r--r--  numpy/core/src/umath/ufunc_object.c                12
-rw-r--r--  numpy/core/src/umath/ufunc_type_resolution.c       87
-rw-r--r--  numpy/core/src/umath/ufunc_type_resolution.h        6
-rw-r--r--  numpy/core/tests/test_multiarray.py               177
18 files changed, 812 insertions, 361 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index bc034c3e9..668aee935 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -1513,129 +1513,6 @@ add_newdoc('numpy.core.multiarray', 'getbuffer',
""")
-add_newdoc('numpy.core', 'matmul',
- """
- matmul(a, b, out=None)
-
- Matrix product of two arrays.
-
- The behavior depends on the arguments in the following way.
-
- - If both arguments are 2-D they are multiplied like conventional
- matrices.
- - If either argument is N-D, N > 2, it is treated as a stack of
- matrices residing in the last two indexes and broadcast accordingly.
- - If the first argument is 1-D, it is promoted to a matrix by
- prepending a 1 to its dimensions. After matrix multiplication
- the prepended 1 is removed.
- - If the second argument is 1-D, it is promoted to a matrix by
- appending a 1 to its dimensions. After matrix multiplication
- the appended 1 is removed.
-
- Multiplication by a scalar is not allowed, use ``*`` instead. Note that
- multiplying a stack of matrices with a vector will result in a stack of
- vectors, but matmul will not recognize it as such.
-
- ``matmul`` differs from ``dot`` in two important ways.
-
- - Multiplication by scalars is not allowed.
- - Stacks of matrices are broadcast together as if the matrices
- were elements.
-
- .. warning::
- This function is preliminary and included in NumPy 1.10.0 for testing
- and documentation. Its semantics will not change, but the number and
- order of the optional arguments will.
-
- .. versionadded:: 1.10.0
-
- Parameters
- ----------
- a : array_like
- First argument.
- b : array_like
- Second argument.
- out : ndarray, optional
- Output argument. This must have the exact kind that would be returned
- if it was not used. In particular, it must have the right type, must be
- C-contiguous, and its dtype must be the dtype that would be returned
- for `dot(a,b)`. This is a performance feature. Therefore, if these
- conditions are not met, an exception is raised, instead of attempting
- to be flexible.
-
- Returns
- -------
- output : ndarray
- Returns the dot product of `a` and `b`. If `a` and `b` are both
- 1-D arrays then a scalar is returned; otherwise an array is
- returned. If `out` is given, then it is returned.
-
- Raises
- ------
- ValueError
- If the last dimension of `a` is not the same size as
- the second-to-last dimension of `b`.
-
- If scalar value is passed.
-
- See Also
- --------
- vdot : Complex-conjugating dot product.
- tensordot : Sum products over arbitrary axes.
- einsum : Einstein summation convention.
- dot : alternative matrix product with different broadcasting rules.
-
- Notes
- -----
- The matmul function implements the semantics of the `@` operator introduced
- in Python 3.5 following PEP465.
-
- Examples
- --------
- For 2-D arrays it is the matrix product:
-
- >>> a = [[1, 0], [0, 1]]
- >>> b = [[4, 1], [2, 2]]
- >>> np.matmul(a, b)
- array([[4, 1],
- [2, 2]])
-
- For 2-D mixed with 1-D, the result is the usual.
-
- >>> a = [[1, 0], [0, 1]]
- >>> b = [1, 2]
- >>> np.matmul(a, b)
- array([1, 2])
- >>> np.matmul(b, a)
- array([1, 2])
-
-
- Broadcasting is conventional for stacks of arrays
-
- >>> a = np.arange(2*2*4).reshape((2,2,4))
- >>> b = np.arange(2*2*4).reshape((2,4,2))
- >>> np.matmul(a,b).shape
- (2, 2, 2)
- >>> np.matmul(a,b)[0,1,1]
- 98
- >>> sum(a[0,1,:] * b[0,:,1])
- 98
-
- Vector, vector returns the scalar inner product, but neither argument
- is complex-conjugated:
-
- >>> np.matmul([2j, 3j], [2j, 3j])
- (-13+0j)
-
- Scalar multiplication raises an error.
-
- >>> np.matmul([1,2], 3)
- Traceback (most recent call last):
- ...
- ValueError: Scalar operands are not allowed, use '*' instead
-
- """)
-
add_newdoc('numpy.core.multiarray', 'c_einsum',
"""
c_einsum(subscripts, *operands, out=None, dtype=None, order='K',
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 9d4e72c0e..a092cb95e 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -127,7 +127,7 @@ class Ufunc(object):
type_descriptions : list of TypeDescription objects
"""
def __init__(self, nin, nout, identity, docstring, typereso,
- *type_descriptions):
+ *type_descriptions, **kwargs):
self.nin = nin
self.nout = nout
if identity is None:
@@ -136,10 +136,13 @@ class Ufunc(object):
self.docstring = docstring
self.typereso = typereso
self.type_descriptions = []
+ self.signature = kwargs.pop('signature', 'NULL')
for td in type_descriptions:
self.type_descriptions.extend(td)
for td in self.type_descriptions:
td.finish_signature(self.nin, self.nout)
+ if kwargs:
+ raise ValueError('unknown kwargs %r' % str(kwargs))
# String-handling utilities to avoid locale-dependence.
@@ -904,7 +907,14 @@ defdict = {
"PyUFunc_SimpleBinaryOperationTypeResolver",
TD(ints),
TD('O', f='npy_ObjectLCM'),
- )
+ ),
+'matmul' :
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.matmul'),
+ "PyUFunc_MatmulTypeResolver",
+ TD(notimes_or_obj),
+ signature='"(n?,k),(k,m?)->(n?,m?)"',
+ ),
}
if sys.version_info[0] >= 3:
@@ -1058,7 +1068,7 @@ def make_ufuncs(funcdict):
f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
{name}_functions, {name}_data, {name}_signatures, {nloops},
{nin}, {nout}, {identity}, "{name}",
- "{doc}", 0, NULL, identity
+ "{doc}", 0, {signature}, identity
);
if ({has_identity}) {{
Py_DECREF(identity);
@@ -1073,7 +1083,8 @@ def make_ufuncs(funcdict):
has_identity='0' if uf.identity is None_ else '1',
identity='PyUFunc_IdentityValue',
identity_expr=uf.identity,
- doc=docstring
+ doc=docstring,
+ signature=uf.signature,
)
# Only PyUFunc_None means don't reorder - we pass this using the old
@@ -1106,6 +1117,8 @@ def make_code(funcdict, filename):
#include "cpuid.h"
#include "ufunc_object.h"
#include "ufunc_type_resolution.h"
+ #include "loops.h"
+ #include "matmul.h"
%s
static int
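
Note: with the hunks above, matmul is registered through the ufunc generator as a
generalized ufunc whose core signature is (n?,k),(k,m?)->(n?,m?); the `?` marks
optional core dimensions, which is how the 1-D promotion rules are expressed. A
doctest-style sketch of the result, assuming a build that includes this patch:

    >>> import numpy as np
    >>> isinstance(np.matmul, np.ufunc)
    True
    >>> np.matmul.signature
    '(n?,k),(k,m?)->(n?,m?)'
    >>> np.matmul(np.ones((2, 3)), np.ones(3)).shape  # optional m? dropped
    (2,)
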
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 13231de29..d79702a18 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -39,7 +39,8 @@ subst = {
def add_newdoc(place, name, doc):
doc = textwrap.dedent(doc).strip()
- if name[0] != '_':
+ if name[0] != '_' and name != 'matmul':
+ # matmul is special, it does not use the OUT_SCALAR replacement strings
if '\nx :' in doc:
assert '$OUT_SCALAR_1' in doc, "in {}".format(name)
elif '\nx2 :' in doc or '\nx1, x2 :' in doc:
@@ -2541,6 +2542,128 @@ add_newdoc('numpy.core.umath', 'fmin',
""")
+add_newdoc('numpy.core.umath', 'matmul',
+ """
+ Matrix product of two arrays.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays, scalars not allowed.
+ out : ndarray, optional
+ A location into which the result is stored. If provided, it must have
+ a shape that matches the signature `(n,k),(k,m)->(n,m)`. If not
+ provided or `None`, a freshly-allocated array is returned.
+ **kwargs
+ For other keyword-only arguments, see the
+ :ref:`ufunc docs <ufuncs.kwargs>`.
+
+ .. versionchanged:: 1.16
+    Now handles ufunc kwargs
+
+ Returns
+ -------
+ y : ndarray
+ The matrix product of the inputs.
+ This is a scalar only when both x1, x2 are 1-d vectors.
+
+ Raises
+ ------
+ ValueError
+ If the last dimension of `x1` is not the same size as
+ the second-to-last dimension of `x2`.
+
+ If a scalar value is passed in.
+
+ See Also
+ --------
+ vdot : Complex-conjugating dot product.
+ tensordot : Sum products over arbitrary axes.
+ einsum : Einstein summation convention.
+ dot : Alternative matrix product with different broadcasting rules.
+
+ Notes
+ -----
+
+ The behavior depends on the arguments in the following way.
+
+ - If both arguments are 2-D they are multiplied like conventional
+ matrices.
+ - If either argument is N-D, N > 2, it is treated as a stack of
+ matrices residing in the last two indexes and broadcast accordingly.
+ - If the first argument is 1-D, it is promoted to a matrix by
+ prepending a 1 to its dimensions. After matrix multiplication
+ the prepended 1 is removed.
+ - If the second argument is 1-D, it is promoted to a matrix by
+ appending a 1 to its dimensions. After matrix multiplication
+ the appended 1 is removed.
+
+ ``matmul`` differs from ``dot`` in two important ways.
+
+ - Multiplication by scalars is not allowed, use ``*`` instead.
+ - Stacks of matrices are broadcast together as if the matrices
+ were elements, respecting the signature `(n,k),(k,m)->(n,m)`:
+
+ >>> a = np.full([9, 5, 7, 4], True, dtype=bool)
+ >>> c = np.full([9, 5, 4, 3], True, dtype=bool)
+ >>> np.dot(a, c).shape
+ (9, 5, 7, 9, 5, 3)
+ >>> np.matmul(a, c).shape # n is 7, k is 4, m is 3
+ (9, 5, 7, 3)
+
+ The ``matmul`` function implements the semantics of the ``@`` operator
+ introduced in Python 3.5 following :pep:`465`.
+
+ Examples
+ --------
+ For 2-D arrays it is the matrix product:
+
+ >>> a = np.array([[1, 0],
+ ... [0, 1]])
+ >>> b = np.array([[4, 1],
+ ... [2, 2]])
+ >>> np.matmul(a, b)
+ array([[4, 1],
+ [2, 2]])
+
+ For a 2-D array mixed with a 1-D array, the result is the usual
+ matrix-vector (or vector-matrix) product.
+
+ >>> a = np.array([[1, 0],
+ ... [0, 1]])
+ >>> b = np.array([1, 2])
+ >>> np.matmul(a, b)
+ array([1, 2])
+ >>> np.matmul(b, a)
+ array([1, 2])
+
+
+ Broadcasting is conventional for stacks of arrays:
+
+ >>> a = np.arange(2*2*4).reshape((2,2,4))
+ >>> b = np.arange(2*2*4).reshape((2,4,2))
+ >>> np.matmul(a,b).shape
+ (2, 2, 2)
+ >>> np.matmul(a,b)[0,1,1]
+ 98
+ >>> sum(a[0,1,:] * b[0,:,1])
+ 98
+
+ Vector, vector returns the scalar inner product, but neither argument
+ is complex-conjugated:
+
+ >>> np.matmul([2j, 3j], [2j, 3j])
+ (-13+0j)
+
+ Scalar multiplication raises an error.
+
+ >>> np.matmul([1,2], 3)
+ Traceback (most recent call last):
+ ...
+ ValueError: Scalar operands are not allowed, use '*' instead
+
+ .. versionadded:: 1.10.0
+ """)
+
add_newdoc('numpy.core.umath', 'modf',
"""
Return the fractional and integral parts of an array, element-wise.
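
Note: the ``versionchanged`` entry above refers to matmul now accepting the
standard (g)ufunc keyword arguments. For example, the ``axes`` argument
(exercised by test_matmul_axes later in this patch) selects which axes hold
the core dimensions; a sketch:

    >>> a = np.arange(3*4*5).reshape(3, 4, 5)
    >>> np.matmul(a, a, axes=[(-2, -1), (-1, -2), (1, 2)]).shape
    (3, 4, 4)
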
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 78963b0aa..d4d5922cd 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -51,7 +51,6 @@ fromiter.__module__ = 'numpy'
frompyfunc.__module__ = 'numpy'
fromstring.__module__ = 'numpy'
geterrobj.__module__ = 'numpy'
-matmul.__module__ = 'numpy'
may_share_memory.__module__ = 'numpy'
nested_iters.__module__ = 'numpy'
promote_types.__module__ = 'numpy'
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 23a9e268b..467b590ac 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -732,6 +732,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'common', 'cblasfuncs.h'),
join('src', 'common', 'lowlevel_strided_loops.h'),
join('src', 'common', 'mem_overlap.h'),
+ join('src', 'common', 'npy_cblas.h'),
join('src', 'common', 'npy_config.h'),
join('src', 'common', 'npy_ctypes.h'),
join('src', 'common', 'npy_extint128.h'),
@@ -892,6 +893,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops.c.src'),
+ join('src', 'umath', 'matmul.h.src'),
+ join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'ufunc_object.c'),
join('src', 'umath', 'extobj.c'),
join('src', 'umath', 'cpuid.c'),
diff --git a/numpy/core/src/common/cblasfuncs.c b/numpy/core/src/common/cblasfuncs.c
index 6460c5db1..514297940 100644
--- a/numpy/core/src/common/cblasfuncs.c
+++ b/numpy/core/src/common/cblasfuncs.c
@@ -182,7 +182,7 @@ _select_matrix_shape(PyArrayObject *array)
* This also makes sure that the data segment is aligned with
* an itemsize address as well by returning one if not true.
*/
-static int
+NPY_NO_EXPORT int
_bad_strides(PyArrayObject *ap)
{
int itemsize = PyArray_ITEMSIZE(ap);
diff --git a/numpy/core/src/common/cblasfuncs.h b/numpy/core/src/common/cblasfuncs.h
index 66ce4ca5b..78eff25a0 100644
--- a/numpy/core/src/common/cblasfuncs.h
+++ b/numpy/core/src/common/cblasfuncs.h
@@ -3,5 +3,7 @@
NPY_NO_EXPORT PyObject *
cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT int
+_bad_strides(PyArrayObject *ap);
#endif
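
Note: exposing _bad_strides here lets the new matmul type resolver detect
operands whose strides BLAS cannot consume (negative, or not a multiple of
the itemsize) and copy them first. An illustrative view that would take the
copy path, assuming this heuristic:

    >>> m = np.arange(9.).reshape(3, 3)[::-1]  # reversed rows
    >>> m.strides                              # negative stride on axis 0
    (-24, 8)
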
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 909a24359..5ccb7f6d6 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -72,7 +72,6 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
*****************************************************************************
*/
#include "funcs.inc"
-#include "loops.h"
#include "umathmodule.h"
NPY_NO_EXPORT int initscalarmath(PyObject *);
@@ -2318,157 +2317,6 @@ fail:
return NULL;
}
-
-
-/*
- * matmul
- *
- * Implements the protocol used by the '@' operator defined in PEP 364.
- * Not in the NUMPY API at this time, maybe later.
- *
- *
- * in1: Left hand side operand
- * in2: Right hand side operand
- * out: Either NULL, or an array into which the output should be placed.
- *
- * Returns NULL on error.
- */
-static PyObject *
-array_matmul(PyObject *NPY_UNUSED(m), PyObject *args, PyObject* kwds)
-{
- PyObject *in1, *in2, *out = NULL;
- char* kwlist[] = {"a", "b", "out", NULL };
- PyArrayObject *ap1, *ap2, *ret = NULL;
- NPY_ORDER order = NPY_KEEPORDER;
- NPY_CASTING casting = NPY_SAFE_CASTING;
- PyArray_Descr *dtype;
- int nd1, nd2, typenum;
- char *subscripts;
- PyArrayObject *ops[2];
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O:matmul", kwlist,
- &in1, &in2, &out)) {
- return NULL;
- }
-
- if (out != NULL) {
- if (out == Py_None) {
- out = NULL;
- }
- else if (!PyArray_Check(out)) {
- PyErr_SetString(PyExc_TypeError, "'out' must be an array");
- return NULL;
- }
- }
-
- dtype = PyArray_DescrFromObject(in1, NULL);
- dtype = PyArray_DescrFromObject(in2, dtype);
- if (dtype == NULL) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot find a common data type.");
- }
- return NULL;
- }
- typenum = dtype->type_num;
-
- if (typenum == NPY_OBJECT) {
- /* matmul is not currently implemented for object arrays */
- PyErr_SetString(PyExc_TypeError,
- "Object arrays are not currently supported");
- Py_DECREF(dtype);
- return NULL;
- }
-
- ap1 = (PyArrayObject *)PyArray_FromAny(in1, dtype, 0, 0,
- NPY_ARRAY_ALIGNED, NULL);
- if (ap1 == NULL) {
- return NULL;
- }
-
- Py_INCREF(dtype);
- ap2 = (PyArrayObject *)PyArray_FromAny(in2, dtype, 0, 0,
- NPY_ARRAY_ALIGNED, NULL);
- if (ap2 == NULL) {
- Py_DECREF(ap1);
- return NULL;
- }
-
- if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
- /* Scalars are rejected */
- PyErr_SetString(PyExc_ValueError,
- "Scalar operands are not allowed, use '*' instead");
- return NULL;
- }
-
- nd1 = PyArray_NDIM(ap1);
- nd2 = PyArray_NDIM(ap2);
-
-#if defined(HAVE_CBLAS)
- if (nd1 <= 2 && nd2 <= 2 &&
- (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
- NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
- return cblas_matrixproduct(typenum, ap1, ap2, (PyArrayObject *)out);
- }
-#endif
-
- /*
- * Use einsum for the stacked cases. This is a quick implementation
- * to avoid setting up the proper iterators. Einsum broadcasts, so
- * we need to check dimensions before the call.
- */
- if (nd1 == 1 && nd2 == 1) {
- /* vector vector */
- if (PyArray_DIM(ap1, 0) != PyArray_DIM(ap2, 0)) {
- dot_alignment_error(ap1, 0, ap2, 0);
- goto fail;
- }
- subscripts = "i, i";
- }
- else if (nd1 == 1) {
- /* vector matrix */
- if (PyArray_DIM(ap1, 0) != PyArray_DIM(ap2, nd2 - 2)) {
- dot_alignment_error(ap1, 0, ap2, nd2 - 2);
- goto fail;
- }
- subscripts = "i, ...ij";
- }
- else if (nd2 == 1) {
- /* matrix vector */
- if (PyArray_DIM(ap1, nd1 - 1) != PyArray_DIM(ap2, 0)) {
- dot_alignment_error(ap1, nd1 - 1, ap2, 0);
- goto fail;
- }
- subscripts = "...i, i";
- }
- else {
- /* matrix * matrix */
- if (PyArray_DIM(ap1, nd1 - 1) != PyArray_DIM(ap2, nd2 - 2)) {
- dot_alignment_error(ap1, nd1 - 1, ap2, nd2 - 2);
- goto fail;
- }
- subscripts = "...ij, ...jk";
- }
- ops[0] = ap1;
- ops[1] = ap2;
- ret = PyArray_EinsteinSum(subscripts, 2, ops, NULL, order, casting,
- (PyArrayObject *)out);
- Py_DECREF(ap1);
- Py_DECREF(ap2);
-
- /* If no output was supplied, possibly convert to a scalar */
- if (ret != NULL && out == NULL) {
- return PyArray_Return((PyArrayObject *)ret);
- }
- return (PyObject *)ret;
-
-fail:
- Py_XDECREF(ap1);
- Py_XDECREF(ap2);
- return NULL;
-}
-
-
static int
einsum_sub_op_from_str(PyObject *args, PyObject **str_obj, char **subscripts,
PyArrayObject **op)
@@ -4285,9 +4133,6 @@ static struct PyMethodDef array_module_methods[] = {
{"vdot",
(PyCFunction)array_vdot,
METH_VARARGS | METH_KEYWORDS, NULL},
- {"matmul",
- (PyCFunction)array_matmul,
- METH_VARARGS | METH_KEYWORDS, NULL},
{"c_einsum",
(PyCFunction)array_einsum,
METH_VARARGS|METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 5ee536d4f..d153a8a64 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -112,6 +112,7 @@ _PyArray_SetNumericOps(PyObject *dict)
SET(minimum);
SET(rint);
SET(conjugate);
+ SET(matmul);
return 0;
}
@@ -177,6 +178,7 @@ _PyArray_GetNumericOps(void)
GET(minimum);
GET(rint);
GET(conjugate);
+ GET(matmul);
return dict;
fail:
@@ -382,14 +384,8 @@ array_divmod(PyArrayObject *m1, PyObject *m2)
static PyObject *
array_matrix_multiply(PyArrayObject *m1, PyObject *m2)
{
- static PyObject *matmul = NULL;
-
- npy_cache_import("numpy.core.multiarray", "matmul", &matmul);
- if (matmul == NULL) {
- return NULL;
- }
BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_matrix_multiply, array_matrix_multiply);
- return PyArray_GenericBinaryFunction(m1, m2, matmul);
+ return PyArray_GenericBinaryFunction(m1, m2, n_ops.matmul);
}
static PyObject *
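
Note: after this change the `@` operator on ndarrays dispatches straight to
the cached ufunc in the numeric-ops table instead of importing
numpy.core.multiarray.matmul at call time. From the caller's side behavior
is unchanged:

    >>> a = np.arange(4.).reshape(2, 2)
    >>> b = np.eye(2)
    >>> np.array_equal(a @ b, np.matmul(a, b))
    True
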
diff --git a/numpy/core/src/multiarray/number.h b/numpy/core/src/multiarray/number.h
index fbdfe6f94..33a7cf872 100644
--- a/numpy/core/src/multiarray/number.h
+++ b/numpy/core/src/multiarray/number.h
@@ -39,6 +39,7 @@ typedef struct {
PyObject *minimum;
PyObject *rint;
PyObject *conjugate;
+ PyObject *matmul;
} NumericOps;
extern NPY_NO_EXPORT NumericOps n_ops;
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 0f201b966..2f71c8ae9 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -1104,8 +1104,7 @@ static PyNumberMethods gentype_as_number = {
(binaryfunc)gentype_add, /*nb_add*/
(binaryfunc)gentype_subtract, /*nb_subtract*/
(binaryfunc)gentype_multiply, /*nb_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(binaryfunc)gentype_divide, /*nb_divide*/
#endif
(binaryfunc)gentype_remainder, /*nb_remainder*/
@@ -1121,8 +1120,7 @@ static PyNumberMethods gentype_as_number = {
(binaryfunc)gentype_and, /*nb_and*/
(binaryfunc)gentype_xor, /*nb_xor*/
(binaryfunc)gentype_or, /*nb_or*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*nb_coerce*/
#endif
(unaryfunc)gentype_int, /*nb_int*/
@@ -1132,16 +1130,14 @@ static PyNumberMethods gentype_as_number = {
(unaryfunc)gentype_long, /*nb_long*/
#endif
(unaryfunc)gentype_float, /*nb_float*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(unaryfunc)gentype_oct, /*nb_oct*/
(unaryfunc)gentype_hex, /*nb_hex*/
#endif
0, /*inplace_add*/
0, /*inplace_subtract*/
0, /*inplace_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*inplace_divide*/
#endif
0, /*inplace_remainder*/
@@ -1156,6 +1152,10 @@ static PyNumberMethods gentype_as_number = {
0, /*nb_inplace_floor_divide*/
0, /*nb_inplace_true_divide*/
(unaryfunc)NULL, /*nb_index*/
+#if PY_VERSION_HEX >= 0x03050000
+ 0, /*nb_matrix_multiply*/
+ 0, /*nb_inplace_matrix_multiply*/
+#endif
};
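
Note: the new slots are filled with 0, so generic numpy scalars still do not
implement `@` themselves; Python falls back and raises TypeError, consistent
with test_matmul_raises below. A sketch (the exact message may differ):

    >>> np.float64(2.0) @ np.float64(3.0)
    Traceback (most recent call last):
    ...
    TypeError: unsupported operand type(s) for @: ...
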
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
new file mode 100644
index 000000000..5f5bd3506
--- /dev/null
+++ b/numpy/core/src/umath/matmul.c.src
@@ -0,0 +1,403 @@
+/* -*- c -*- */
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+
+#include "npy_config.h"
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_cblas.h"
+#include "arraytypes.h" /* For TYPE_dot functions */
+#include <assert.h>
+
+/*
+ *****************************************************************************
+ ** BASICS **
+ *****************************************************************************
+ */
+
+#if defined(HAVE_CBLAS)
+static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
+static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
+
+/**begin repeat
+ *
+ * #name = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ * #ctype = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #type = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #prefix = s, d, c, z#
+ * #step1 = 1.F, 1., &oneF, &oneD#
+ * #step0 = 0.F, 0., &zeroF, &zeroD#
+ */
+NPY_NO_EXPORT void
+@name@_gemv(void *ip1, npy_intp is1_m, void *ip2, npy_intp is2_n, void *op,
+ npy_intp m, npy_intp n)
+{
+ /*
+ * Vector matrix multiplication -- Level 2 BLAS
+ * arguments
+ * ip1: contiguous data, m*n shape
+ * ip2: data in c order, n*1 shape
+ * op: contiguous data in c order, m shape
+ */
+ enum CBLAS_ORDER order;
+ int lda;
+
+ if (is1_m == sizeof(@type@)) {
+ order = CblasColMajor;
+ lda = n;
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is1_m == sizeof(@type@) * m);
+ order = CblasRowMajor;
+ lda = m;
+ }
+ cblas_@prefix@gemv(order, CblasTrans, n, m, @step1@, ip1, lda, ip2,
+ is2_n / sizeof(@type@), @step0@, op, 1);
+}
+
+NPY_NO_EXPORT void
+@name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
+ void *ip2, npy_intp is2_n, npy_intp is2_p,
+ void *op, npy_intp m, npy_intp n, npy_intp p)
+{
+ /*
+ * matrix matrix multiplication -- Level 3 BLAS
+ */
+ enum CBLAS_ORDER order = CblasRowMajor;
+ enum CBLAS_TRANSPOSE trans1, trans2;
+ int M, N, P, lda, ldb;
+ M = m;
+ N = n;
+ P = p;
+
+ if (is1_m == sizeof(@type@)) {
+ trans1 = CblasTrans;
+ lda = N > 1 ? is1_n / sizeof(@type@) : 1;
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is1_n == sizeof(@type@));
+ trans1 = CblasNoTrans;
+ lda = N > 1 ? is1_m / sizeof(@type@) : 1;
+ }
+ if (is2_n == sizeof(@type@)) {
+ trans2 = CblasTrans;
+ ldb = N > 1 ? is2_p / sizeof(@type@) : 1;
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is2_p == sizeof(@type@));
+ trans2 = CblasNoTrans;
+ ldb = P > 1 ? P : 1;
+ }
+ /*
+ * Use syrk if we have a case of a matrix times its transpose.
+ * Otherwise, use gemm for all other cases.
+ */
+ if (
+ (ip1 == ip2) &&
+ (m == p) &&
+ (is1_m == is2_p) &&
+ (is1_n == is2_n) &&
+ (trans1 != trans2)
+ ) {
+ npy_intp i,j;
+ if (trans1 == CblasNoTrans) {
+ cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
+ ip1, lda, @step0@, op, P);
+ }
+ else {
+ cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
+ ip1, ldb, @step0@, op, P);
+ }
+ /* Copy the triangle */
+ for (i = 0; i < P; i++) {
+ for (j = i + 1; j < P; j++) {
+ ((@type@*)op)[j * P + i] = ((@type@*)op)[i * P + j];
+ }
+ }
+
+ }
+ else {
+ cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda,
+ ip2, ldb, @step0@, op, P);
+ }
+}
+
+/**end repeat**/
+#endif
+
+/*
+ * matmul loops
+ * signature is (m?,n),(n,p?)->(m?,p?)
+ */
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, HALF#
+ * #typ = npy_float,npy_double,npy_half#
+ * #SPECL = 0,0,1#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
+ npy_intp dm, npy_intp dn, npy_intp dp,
+ npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
+ npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
+ npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
+ npy_intp os_p)
+{
+ npy_intp m, n, p;
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+ /*
+ * Use a double as an intermediate sum, which is natural for
+ * npy_double, slightly increases the accuracy of npy_float,
+ * and is perhaps overkill for npy_half.
+ */
+ double sum = 0;
+ for (n = 0; n < dn; n++) {
+#if @SPECL@ == 1
+ @typ@ val1 = (*(@typ@ *)ip1);
+ @typ@ val2 = (*(@typ@ *)ip2);
+ sum += npy_half_to_float(val1) * npy_half_to_float(val2);
+#else
+ sum += ((double)*(@typ@ *)ip1) * ((double)*(@typ@ *)ip2);
+#endif
+ ip2 += is2_n;
+ ip1 += is1_n;
+ }
+#if @SPECL@ == 1
+ *(@typ@ *)op = npy_float_to_half((float)sum);
+#else
+ /* in the case of double -> float, may produce INF */
+ *(@typ@ *)op = (@typ@)sum;
+#endif
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #TYPE = LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL#
+ * #typ = npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_bool#
+ * #IS_COMPLEX = 0, 1, 1, 1, 0*11#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
+ npy_intp dm, npy_intp dn, npy_intp dp,
+ npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
+ npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
+ npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
+ npy_intp os_p)
+{
+ npy_intp m, n, p;
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+#if @IS_COMPLEX@ == 1
+ (*(@typ@ *)op).real = 0;
+ (*(@typ@ *)op).imag = 0;
+#else
+ *(@typ@ *)op = 0;
+#endif
+ for (n = 0; n < dn; n++) {
+ @typ@ val1 = (*(@typ@ *)ip1);
+ @typ@ val2 = (*(@typ@ *)ip2);
+#if @IS_COMPLEX@ == 1
+ (*(@typ@ *)op).real += (val1.real * val2.real) -
+ (val1.imag * val2.imag);
+ (*(@typ@ *)op).imag += (val1.real * val2.imag) +
+ (val1.imag * val2.real);
+#else
+ *(@typ@ *)op += val1 * val2;
+#endif
+ ip2 += is2_n;
+ ip1 += is1_n;
+ }
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL#
+ * #typ = npy_float,npy_double,npy_longdouble, npy_half,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_bool#
+ * #SPECL = 0, 0, 0, 2, 1, 1, 1, 0*11#
+ * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*12#
+ * #chr = s, d, 0, 0, c, z, 0*12#
+ * #blas_typ = npy_float, npy_double, void*16#
+ */
+
+
+NPY_NO_EXPORT void
+@TYPE@_matmul(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ npy_intp dOuter = *dimensions++;
+ npy_intp iOuter;
+ npy_intp s0 = *steps++;
+ npy_intp s1 = *steps++;
+ npy_intp s2 = *steps++;
+ npy_intp dm = dimensions[0];
+ npy_intp dn = dimensions[1];
+ npy_intp dp = dimensions[2];
+ npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+ os_m=steps[4], os_p=steps[5];
+ npy_intp ib1_n, ib2_n, ib2_p, ob_p;
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+ npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+ npy_bool scalar_out = (dm == 1 && dp == 1);
+ npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+ npy_bool too_big_for_blas = (dm > NPY_MAX_INT || dn > NPY_MAX_INT ||
+ dp > NPY_MAX_INT);
+ npy_bool input_contiguous = ((is1_m == sizeof(@typ@) ||
+ is1_n == sizeof(@typ@)) &&
+ (is2_n == sizeof(@typ@) ||
+ is2_p == sizeof(@typ@)));
+ npy_bool vector_matrix = ((dm == 1) &&
+ (is2_n == sizeof(@typ@) || (is2_n == sizeof(@typ@) * dp)));
+ npy_bool matrix_vector = ((dp == 1) &&
+ (is1_n == sizeof(@typ@) || (is1_n == sizeof(@typ@) * dm)));
+#endif
+
+ ib1_n = is1_n*dn;
+ ib2_n = is2_n*dn;
+ ib2_p = is2_p*dp;
+ ob_p = os_p *dp;
+
+ if (dn == 0) {
+ /* The summed-over dimension is 0; nothing to add, so zero the output */
+ for (iOuter = 0; iOuter < dOuter; iOuter++,
+ args[0] += s0, args[1] += s1, args[2] += s2) {
+ npy_intp m, p;
+ char *op=args[2];
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+#if @SPECL@ == 1
+ (*(@typ@ *)op).real = 0;
+ (*(@typ@ *)op).imag = 0;
+#else
+ *(@typ@ *)op = 0;
+#endif
+ op += os_p;
+ }
+ op += os_m - ob_p;
+ }
+ }
+ return;
+ }
+ for (iOuter = 0; iOuter < dOuter; iOuter++,
+ args[0] += s0, args[1] += s1, args[2] += s2) {
+ char *ip1=args[0], *ip2=args[1], *op=args[2];
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+ /*
+ * TODO: refactor this out to an inner_loop_selector, in
+ * PyUFunc_MatmulLoopSelector. But that call does not have access to
+ * n, m, p and strides.
+ */
+ if (too_big_for_blas) {
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ }
+ else if (special_case) {
+ /* Special case variants that have a 1 in the core dimensions */
+ if (scalar_out) {
+ /* row @ column, 1,1 output */
+ @TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+ } else if (scalar_vec) {
+ /*
+ * 0d @ vector or vector @ 0d
+ * could use cblas_Xaxpy, but that requires zeroing the output
+ * and would not be faster (XXX prove it)
+ */
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ } else if (vector_matrix) {
+ /* vector @ matrix, switch ip1, ip2, p and m */
+ @TYPE@_gemv((void*)ip2, is2_n, (void*)ip1, is1_n,
+ (void*)op, dp, dn);
+ } else if (matrix_vector) {
+ /* matrix @ vector */
+ @TYPE@_gemv((void*)ip1, is1_n, (void*)ip2, is2_n,
+ (void*)op, dm, dn);
+ } else {
+ /* column @ row, 2d output, no blas needed or non-contiguous input */
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ }
+ } else {
+ /* matrix @ matrix */
+ if (input_contiguous) {
+ /* can only use blas if input is contiguous */
+ @TYPE@_matmul_matrixmatrix((void*)ip1, is1_m, is1_n,
+ (void*)ip2, is2_n, is2_p,
+ (void*)op, dm, dn, dp);
+ } else {
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ }
+ }
+#else
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+
+#endif
+ }
+}
+
+/**end repeat**/
+
+
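
Note: for readability, here is a rough Python model of the
@TYPE@_matmul_inner_noblas kernel above, with the byte-stride bookkeeping
replaced by plain indexing; it is illustrative only, not the C code:

    def matmul_inner_noblas(a, b, out, dm, dn, dp):
        # out[m, p] = sum over n of a[m, n] * b[n, p]; the C version walks
        # raw pointers by (is1_m, is1_n, is2_n, is2_p, os_m, os_p) and
        # accumulates in double for the float/half loops.
        for m in range(dm):
            for p in range(dp):
                acc = 0
                for n in range(dn):
                    acc += a[m, n] * b[n, p]
                out[m, p] = acc

The outer @TYPE@_matmul entry then picks a path per invocation: dot for a
row @ column scalar result, gemv for matrix-vector shapes, gemm (or syrk for
a matrix times its own transpose) for matrix-matrix, falling back to this
loop for non-contiguous or oversized operands and for dtypes without BLAS
support.
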
diff --git a/numpy/core/src/umath/matmul.h.src b/numpy/core/src/umath/matmul.h.src
new file mode 100644
index 000000000..16be7675b
--- /dev/null
+++ b/numpy/core/src/umath/matmul.h.src
@@ -0,0 +1,12 @@
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_matmul(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
+
+
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index e98d9f865..a7987acda 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -1564,7 +1564,6 @@ static PyObject*
}
/**end repeat**/
-
/**begin repeat
* #name = byte, ubyte, short, ushort, int, uint,
* long, ulong, longlong, ulonglong,
@@ -1575,8 +1574,7 @@ static PyNumberMethods @name@_as_number = {
(binaryfunc)@name@_add, /*nb_add*/
(binaryfunc)@name@_subtract, /*nb_subtract*/
(binaryfunc)@name@_multiply, /*nb_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(binaryfunc)@name@_divide, /*nb_divide*/
#endif
(binaryfunc)@name@_remainder, /*nb_remainder*/
@@ -1596,8 +1594,7 @@ static PyNumberMethods @name@_as_number = {
(binaryfunc)@name@_and, /*nb_and*/
(binaryfunc)@name@_xor, /*nb_xor*/
(binaryfunc)@name@_or, /*nb_or*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*nb_coerce*/
#endif
(unaryfunc)@name@_int, /*nb_int*/
@@ -1607,16 +1604,14 @@ static PyNumberMethods @name@_as_number = {
(unaryfunc)@name@_long, /*nb_long*/
#endif
(unaryfunc)@name@_float, /*nb_float*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(unaryfunc)@name@_oct, /*nb_oct*/
(unaryfunc)@name@_hex, /*nb_hex*/
#endif
0, /*inplace_add*/
0, /*inplace_subtract*/
0, /*inplace_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*inplace_divide*/
#endif
0, /*inplace_remainder*/
@@ -1631,6 +1626,10 @@ static PyNumberMethods @name@_as_number = {
0, /*nb_inplace_floor_divide*/
0, /*nb_inplace_true_divide*/
(unaryfunc)NULL, /*nb_index*/
+#if PY_VERSION_HEX >= 0x03050000
+ 0, /*nb_matrix_multiply*/
+ 0, /*nb_inplace_matrix_multiply*/
+#endif
};
/**end repeat**/
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 1fe8745a0..66f512f7b 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2845,13 +2845,15 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
}
/* Fill in any allocated outputs */
- for (i = nin; i < nop; ++i) {
- if (op[i] == NULL) {
- op[i] = NpyIter_GetOperandArray(iter)[i];
- Py_INCREF(op[i]);
+ {
+ PyArrayObject **operands = NpyIter_GetOperandArray(iter);
+ for (i = 0; i < nop; ++i) {
+ if (op[i] == NULL) {
+ op[i] = operands[i];
+ Py_INCREF(op[i]);
+ }
}
}
-
/*
* Set up the inner strides array. Because we're not doing
* buffering, the strides are fixed throughout the looping.
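
Note: grabbing references to all iterator operands (not just allocated
outputs) is what lets a writeback temporary stand in for an output that
overlaps an input. Combined with the type-resolver changes below, this makes
overlapping calls like the one in test_matmul_out behave:

    >>> a = np.arange(18.).reshape(2, 3, 3)
    >>> b = np.matmul(a, a)
    >>> c = np.matmul(a, a, out=a)  # out overlaps an input
    >>> c is a and np.array_equal(c, b)
    True
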
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 6b042d837..efd923972 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -22,6 +22,11 @@
#include "ufunc_object.h"
#include "common.h"
+#include "mem_overlap.h"
+#if defined(HAVE_CBLAS)
+#include "cblasfuncs.h"
+#endif
+
static const char *
npy_casting_to_string(NPY_CASTING casting)
{
@@ -1299,6 +1304,88 @@ PyUFunc_MixedDivisionTypeResolver(PyUFuncObject *ufunc,
type_tup, out_dtypes);
}
+/*
+ * XXX This is too restrictive, we should only check the inner axis involved
+ * gh-12365
+ */
+#define NOT_CONTIGUOUS(a) (PyArray_NDIM(a) >= 2 && !( \
+ PyArray_IS_C_CONTIGUOUS(a) || PyArray_IS_F_CONTIGUOUS(a)))
+
+/*
+ * This function applies special type resolution rules for the case
+ * where all the functions have the pattern XX->X. Like
+ * PyUFunc_SimpleBinaryOperationTypeResolver, it uses
+ * PyArray_ResultType instead of a linear search to get the best
+ * loop. It then adds memory-overlap and contiguity considerations
+ * for the operands, possibly creating writeback temporaries.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+NPY_NO_EXPORT int
+PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
+ NPY_CASTING casting,
+ PyArrayObject **operands,
+ PyObject *type_tup,
+ PyArray_Descr **out_dtypes)
+{
+
+ if (PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting, operands,
+ type_tup, out_dtypes) < 0) {
+ return -1;
+ }
+ if (PyArray_NDIM(operands[0]) >= 2 || PyArray_NDIM(operands[1]) >= 2) {
+#if defined(HAVE_CBLAS)
+ int typenum = out_dtypes[2]->type_num;
+ if ((NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
+ NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
+ /*
+ * We are going to use BLAS
+ * make sure 2d and more arrays are contiguous
+ */
+ if (_bad_strides(operands[0]) || NOT_CONTIGUOUS(operands[0])) {
+ PyObject *op = PyArray_NewCopy(operands[0], NPY_ANYORDER);
+
+ if (op == NULL) {
+ return -1;
+ }
+ Py_DECREF(operands[0]);
+ operands[0] = (PyArrayObject *)op;
+ }
+ if (_bad_strides(operands[1]) || NOT_CONTIGUOUS(operands[1])) {
+ PyObject *op = PyArray_NewCopy(operands[1], NPY_ANYORDER);
+
+ if (op == NULL) {
+ return -1;
+ }
+ Py_DECREF(operands[1]);
+ operands[1] = (PyArrayObject *)op;
+ }
+ }
+#endif
+ if (operands[2] != NULL) {
+ npy_intp last_stride = PyArray_STRIDE(operands[2],
+ PyArray_NDIM(operands[2]) - 1);
+ if (last_stride != PyArray_ITEMSIZE(operands[2])) {
+ PyErr_SetString(PyExc_ValueError,
+ "output array is not acceptable (must be C-contiguous)");
+ return -1;
+ }
+ /*
+ * Use all the flags in PyUFunc_GeneralizedFunction except
+ * NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE. Without that flag,
+ * nditer will properly check overlap between the output and
+ * any input, making a writeback temporary if needed.
+ */
+ ufunc->op_flags[2] = NPY_ITER_WRITEONLY |
+ NPY_ITER_UPDATEIFCOPY |
+ NPY_ITER_ALIGNED |
+ NPY_ITER_ALLOCATE |
+ NPY_ITER_NO_BROADCAST;
+ }
+ }
+ return 0;
+}
+
static int
find_userloop(PyUFuncObject *ufunc,
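
Note: the resolver's out-argument handling matches the behavior exercised in
the tests below: a cast to a wider dtype is allowed, an unsafe downcast
raises TypeError, and a non-C-contiguous output is rejected outright. A
sketch:

    >>> a = b = np.ones((2, 2))
    >>> out = np.zeros((2, 2), dtype=np.complex128)
    >>> np.matmul(a, b, out=out) is out
    True
    >>> np.matmul(a, b, out=np.zeros((2, 2), dtype=np.int32))
    Traceback (most recent call last):
    ...
    TypeError: Cannot cast ufunc matmul output ...
    >>> np.matmul(a, b, out=np.ones((2, 2, 2))[..., 0])
    Traceback (most recent call last):
    ...
    ValueError: output array is not acceptable (must be C-contiguous)
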
diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h
index bb4823d24..5306a5983 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.h
+++ b/numpy/core/src/umath/ufunc_type_resolution.h
@@ -145,5 +145,11 @@ PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc,
NpyAuxData **out_innerloopdata,
int *out_needs_api);
+NPY_NO_EXPORT int
+PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
+ NPY_CASTING casting,
+ PyArrayObject **operands,
+ PyObject *type_tup,
+ PyArray_Descr **out_dtypes);
#endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 51fe6e9ef..80c69d9b3 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -2595,7 +2595,8 @@ class TestMethods(object):
assert_equal(x1.flatten('F'), y1f)
assert_equal(x1.flatten('F'), x1.T.flatten())
- def test_dot(self):
+ @pytest.mark.parametrize('func', (np.dot, np.matmul))
+ def test_arr_mult(self, func):
a = np.array([[1, 0], [0, 1]])
b = np.array([[0, 1], [1, 0]])
c = np.array([[9, 1], [1, -9]])
@@ -2619,49 +2620,49 @@ class TestMethods(object):
# gemm vs syrk optimizations
for et in [np.float32, np.float64, np.complex64, np.complex128]:
eaf = a.astype(et)
- assert_equal(np.dot(eaf, eaf), eaf)
- assert_equal(np.dot(eaf.T, eaf), eaf)
- assert_equal(np.dot(eaf, eaf.T), eaf)
- assert_equal(np.dot(eaf.T, eaf.T), eaf)
- assert_equal(np.dot(eaf.T.copy(), eaf), eaf)
- assert_equal(np.dot(eaf, eaf.T.copy()), eaf)
- assert_equal(np.dot(eaf.T.copy(), eaf.T.copy()), eaf)
+ assert_equal(func(eaf, eaf), eaf)
+ assert_equal(func(eaf.T, eaf), eaf)
+ assert_equal(func(eaf, eaf.T), eaf)
+ assert_equal(func(eaf.T, eaf.T), eaf)
+ assert_equal(func(eaf.T.copy(), eaf), eaf)
+ assert_equal(func(eaf, eaf.T.copy()), eaf)
+ assert_equal(func(eaf.T.copy(), eaf.T.copy()), eaf)
# syrk validations
for et in [np.float32, np.float64, np.complex64, np.complex128]:
eaf = a.astype(et)
ebf = b.astype(et)
- assert_equal(np.dot(ebf, ebf), eaf)
- assert_equal(np.dot(ebf.T, ebf), eaf)
- assert_equal(np.dot(ebf, ebf.T), eaf)
- assert_equal(np.dot(ebf.T, ebf.T), eaf)
+ assert_equal(func(ebf, ebf), eaf)
+ assert_equal(func(ebf.T, ebf), eaf)
+ assert_equal(func(ebf, ebf.T), eaf)
+ assert_equal(func(ebf.T, ebf.T), eaf)
# syrk - different shape, stride, and view validations
for et in [np.float32, np.float64, np.complex64, np.complex128]:
edf = d.astype(et)
assert_equal(
- np.dot(edf[::-1, :], edf.T),
- np.dot(edf[::-1, :].copy(), edf.T.copy())
+ func(edf[::-1, :], edf.T),
+ func(edf[::-1, :].copy(), edf.T.copy())
)
assert_equal(
- np.dot(edf[:, ::-1], edf.T),
- np.dot(edf[:, ::-1].copy(), edf.T.copy())
+ func(edf[:, ::-1], edf.T),
+ func(edf[:, ::-1].copy(), edf.T.copy())
)
assert_equal(
- np.dot(edf, edf[::-1, :].T),
- np.dot(edf, edf[::-1, :].T.copy())
+ func(edf, edf[::-1, :].T),
+ func(edf, edf[::-1, :].T.copy())
)
assert_equal(
- np.dot(edf, edf[:, ::-1].T),
- np.dot(edf, edf[:, ::-1].T.copy())
+ func(edf, edf[:, ::-1].T),
+ func(edf, edf[:, ::-1].T.copy())
)
assert_equal(
- np.dot(edf[:edf.shape[0] // 2, :], edf[::2, :].T),
- np.dot(edf[:edf.shape[0] // 2, :].copy(), edf[::2, :].T.copy())
+ func(edf[:edf.shape[0] // 2, :], edf[::2, :].T),
+ func(edf[:edf.shape[0] // 2, :].copy(), edf[::2, :].T.copy())
)
assert_equal(
- np.dot(edf[::2, :], edf[:edf.shape[0] // 2, :].T),
- np.dot(edf[::2, :].copy(), edf[:edf.shape[0] // 2, :].T.copy())
+ func(edf[::2, :], edf[:edf.shape[0] // 2, :].T),
+ func(edf[::2, :].copy(), edf[:edf.shape[0] // 2, :].T.copy())
)
# syrk - different shape
@@ -2669,9 +2670,13 @@ class TestMethods(object):
edf = d.astype(et)
eddtf = ddt.astype(et)
edtdf = dtd.astype(et)
- assert_equal(np.dot(edf, edf.T), eddtf)
- assert_equal(np.dot(edf.T, edf), edtdf)
+ assert_equal(func(edf, edf.T), eddtf)
+ assert_equal(func(edf.T, edf), edtdf)
+ def test_dot(self):
+ a = np.array([[1, 0], [0, 1]])
+ b = np.array([[0, 1], [1, 0]])
+ c = np.array([[9, 1], [1, -9]])
# function versus methods
assert_equal(np.dot(a, b), a.dot(b))
assert_equal(np.dot(np.dot(a, b), c), a.dot(b).dot(c))
@@ -2738,6 +2743,18 @@ class TestMethods(object):
assert_raises(NotImplementedError, np.matmul, A(), A())
assert_raises(NotImplementedError, np.inner, A(), A())
+ def test_matmul_out(self):
+ # overlapping memory
+ a = np.arange(18).reshape(2, 3, 3)
+ b = np.matmul(a, a)
+ c = np.matmul(a, a, out=a)
+ assert_(c is a)
+ assert_equal(c, b)
+ a = np.arange(18).reshape(2, 3, 3)
+ c = np.matmul(a, a, out=a[::-1, ...])
+ assert_(c.base is a.base)
+ assert_equal(c, b)
+
def test_diagonal(self):
a = np.arange(12).reshape((3, 4))
assert_equal(a.diagonal(), [0, 5, 10])
@@ -3147,6 +3164,8 @@ class TestBinop(object):
# 'eq': (np.equal, False),
# 'ne': (np.not_equal, False),
}
+ if sys.version_info >= (3, 5):
+ ops['matmul'] = (np.matmul, False, float)
class Coerced(Exception):
pass
@@ -3189,7 +3208,7 @@ class TestBinop(object):
if issubclass(MyType, np.ndarray):
# Use this range to avoid special case weirdnesses around
# divide-by-0, pow(x, 2), overflow due to pow(big, big), etc.
- return np.arange(3, 5).view(MyType)
+ return np.arange(3, 7).reshape(2, 2).view(MyType)
else:
return MyType()
@@ -3198,7 +3217,7 @@ class TestBinop(object):
for op, (ufunc, has_inplace, dtype) in ops.items():
err_msg = ('op: %s, ufunc: %s, has_inplace: %s, dtype: %s'
% (op, ufunc, has_inplace, dtype))
- check_objs = [np.arange(3, 5, dtype=dtype)]
+ check_objs = [np.arange(3, 7, dtype=dtype).reshape(2, 2)]
if check_scalar:
check_objs.append(check_objs[0][0])
for arr in check_objs:
@@ -5697,13 +5716,36 @@ class MatmulCommon(object):
res = self.matmul(v, v)
assert_(type(res) is np.dtype(dt).type)
- def test_vector_vector_values(self):
- vec = np.array([1, 2])
- tgt = 5
+ def test_0d_vector_values(self):
+ vec1 = np.array([2])
+ vec2 = np.array([3, 4]).reshape(1, -1)
+ tgt = np.array([6, 8])
for dt in self.types[1:]:
- v1 = vec.astype(dt)
- res = self.matmul(v1, v1)
+ v1 = vec1.astype(dt)
+ v2 = vec2.astype(dt)
+ res = self.matmul(v1, v2)
assert_equal(res, tgt)
+ res = self.matmul(v2.T, v1)
+ assert_equal(res, tgt)
+
+ # boolean type
+ vec = np.array([True, True], dtype='?').reshape(1, -1)
+ res = self.matmul(vec[:, 0], vec)
+ assert_equal(res, True)
+
+ def test_vector_vector_values(self):
+ vec1 = np.array([1, 2])
+ vec2 = np.array([3, 4]).reshape(-1, 1)
+ tgt1 = np.array([11])
+ tgt2 = np.array([[3, 6], [4, 8]])
+ for dt in self.types[1:]:
+ v1 = vec1.astype(dt)
+ v2 = vec2.astype(dt)
+ res = self.matmul(v1, v2)
+ assert_equal(res, tgt1)
+ # no broadcast, we must make v1 into a 2d ndarray
+ res = self.matmul(v2, v1.reshape(1, -1))
+ assert_equal(res, tgt2)
# boolean type
vec = np.array([True, True], dtype='?')
@@ -5851,27 +5893,52 @@ class TestMatmul(MatmulCommon):
assert_array_equal(out, tgt, err_msg=msg)
# test out with not allowed type cast (safe casting)
- # einsum and cblas raise different error types, so
- # use Exception.
- msg = "out argument with illegal cast"
+ msg = "Cannot cast ufunc matmul output"
out = np.zeros((2, 2), dtype=np.int32)
- assert_raises(Exception, self.matmul, a, b, out=out)
+ assert_raises_regex(TypeError, msg, self.matmul, a, b, out=out)
- # skip following tests for now, cblas does not allow non-contiguous
+ # cblas does not allow non-contiguous
# outputs and consistency with dot would require same type,
# dimensions, subtype, and c_contiguous.
- # test out with allowed type cast
- # msg = "out argument with allowed cast"
- # out = np.zeros((2, 2), dtype=np.complex128)
- # self.matmul(a, b, out=out)
- # assert_array_equal(out, tgt, err_msg=msg)
+ # test out with type upcast to complex
+ out = np.zeros((2, 2), dtype=np.complex128)
+ c = self.matmul(a, b, out=out)
+ assert_(c is out)
+ with suppress_warnings() as sup:
+ sup.filter(np.ComplexWarning, '')
+ c = c.astype(tgt.dtype)
+ assert_array_equal(c, tgt)
# test out non-contiguous
- # msg = "out argument with non-contiguous layout"
- # c = np.zeros((2, 2, 2), dtype=float)
- # self.matmul(a, b, out=c[..., 0])
- # assert_array_equal(c, tgt, err_msg=msg)
+ out = np.ones((2, 2, 2), dtype=float)
+ assert_raises(ValueError, self.matmul, a, b, out=out[..., 0])
+
+ m1 = np.arange(15.).reshape(5, 3)
+ m2 = np.arange(21.).reshape(3, 7)
+ m3 = np.arange(30.).reshape(5, 6)[:, ::2] # non-contiguous
+ vc = np.arange(10.)
+ vr = np.arange(6.)
+ @pytest.mark.parametrize('args', (
+ # matrix-matrix
+ (m1, m2), (m2.T, m1.T), (m2.T.copy(), m1.T), (m2.T, m1.T.copy()),
+ # matrix-matrix-transpose, contiguous and non
+ (m1, m1.T), (m1.T, m1), (m1, m3.T), (m3, m1.T),
+ (m3, m3.T), (m3.T, m3),
+ # matrix-matrix non-contiguous
+ (m3, m2), (m2.T, m3.T), (m2.T.copy(), m3.T),
+ # vector-matrix, matrix-vector, contiguous
+ (m1, vr[:3]), (vc[:5], m1), (m1.T, vc[:5]), (vr[:3], m1.T),
+ # vector-matrix, matrix-vector, vector non-contiguous
+ (m1, vr[::2]), (vc[::2], m1), (m1.T, vc[::2]), (vr[::2], m1.T),
+ # vector-matrix, matrix-vector, matrix non-contiguous
+ (m3, vr[:3]), (vc[:5], m3), (m3.T, vc[:5]), (vr[:3], m3.T),
+ # vector-matrix, matrix-vector, both non-contiguous
+ (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T)))
+ def test_dot_equivalent(self, args):
+ r1 = np.matmul(*args)
+ r2 = np.dot(*args)
+ assert_equal(r1, r2)
if sys.version_info[:2] >= (3, 5):
@@ -5895,6 +5962,11 @@ if sys.version_info[:2] >= (3, 5):
assert_equal(self.matmul(a, b), "A")
assert_equal(self.matmul(b, a), "A")
+ def test_matmul_raises(self):
+ assert_raises(TypeError, self.matmul, np.int8(5), np.int8(5))
+ assert_raises(TypeError, self.matmul, np.void(b'abc'), np.void(b'abc'))
+ assert_raises(ValueError, self.matmul, np.arange(10), np.void(b'abc'))
+
def test_matmul_inplace():
# It would be nice to support in-place matmul eventually, but for now
# we don't have a working implementation, so better just to error out
@@ -5909,6 +5981,17 @@ if sys.version_info[:2] >= (3, 5):
exec_ = getattr(builtins, "exec")
assert_raises(TypeError, exec_, "a @= b", globals(), locals())
+ def test_matmul_axes():
+ a = np.arange(3*4*5).reshape(3, 4, 5)
+ c = np.matmul(a, a, axes=[(-2, -1), (-1, -2), (1, 2)])
+ assert c.shape == (3, 4, 4)
+ d = np.matmul(a, a, axes=[(-2, -1), (-1, -2), (0, 1)])
+ assert d.shape == (4, 4, 3)
+ e = np.swapaxes(d, 0, 2)
+ assert_array_equal(e, c)
+ f = np.matmul(a, np.arange(3), axes=[(1, 0), (0), (0)])
+ assert f.shape == (4, 5)
+
class TestInner(object):