Diffstat (limited to 'numpy/core')
-rw-r--r--  numpy/core/_add_newdocs.py                        123
-rw-r--r--  numpy/core/code_generators/generate_umath.py       21
-rw-r--r--  numpy/core/code_generators/ufunc_docstrings.py    125
-rw-r--r--  numpy/core/multiarray.py                            1
-rw-r--r--  numpy/core/setup.py                                 3
-rw-r--r--  numpy/core/src/common/cblasfuncs.c                  2
-rw-r--r--  numpy/core/src/common/cblasfuncs.h                  2
-rw-r--r--  numpy/core/src/multiarray/multiarraymodule.c      155
-rw-r--r--  numpy/core/src/multiarray/number.c                 10
-rw-r--r--  numpy/core/src/multiarray/number.h                  1
-rw-r--r--  numpy/core/src/multiarray/scalartypes.c.src        16
-rw-r--r--  numpy/core/src/umath/matmul.c.src                 403
-rw-r--r--  numpy/core/src/umath/matmul.h.src                  12
-rw-r--r--  numpy/core/src/umath/scalarmath.c.src              17
-rw-r--r--  numpy/core/src/umath/ufunc_object.c                12
-rw-r--r--  numpy/core/src/umath/ufunc_type_resolution.c       87
-rw-r--r--  numpy/core/src/umath/ufunc_type_resolution.h        6
-rw-r--r--  numpy/core/tests/test_multiarray.py               177
18 files changed, 812 insertions, 361 deletions
diff --git a/numpy/core/_add_newdocs.py b/numpy/core/_add_newdocs.py
index bc034c3e9..668aee935 100644
--- a/numpy/core/_add_newdocs.py
+++ b/numpy/core/_add_newdocs.py
@@ -1513,129 +1513,6 @@ add_newdoc('numpy.core.multiarray', 'getbuffer',
""")
-add_newdoc('numpy.core', 'matmul',
- """
- matmul(a, b, out=None)
-
- Matrix product of two arrays.
-
- The behavior depends on the arguments in the following way.
-
- - If both arguments are 2-D they are multiplied like conventional
- matrices.
- - If either argument is N-D, N > 2, it is treated as a stack of
- matrices residing in the last two indexes and broadcast accordingly.
- - If the first argument is 1-D, it is promoted to a matrix by
- prepending a 1 to its dimensions. After matrix multiplication
- the prepended 1 is removed.
- - If the second argument is 1-D, it is promoted to a matrix by
- appending a 1 to its dimensions. After matrix multiplication
- the appended 1 is removed.
-
- Multiplication by a scalar is not allowed, use ``*`` instead. Note that
- multiplying a stack of matrices with a vector will result in a stack of
- vectors, but matmul will not recognize it as such.
-
- ``matmul`` differs from ``dot`` in two important ways.
-
- - Multiplication by scalars is not allowed.
- - Stacks of matrices are broadcast together as if the matrices
- were elements.
-
- .. warning::
- This function is preliminary and included in NumPy 1.10.0 for testing
- and documentation. Its semantics will not change, but the number and
- order of the optional arguments will.
-
- .. versionadded:: 1.10.0
-
- Parameters
- ----------
- a : array_like
- First argument.
- b : array_like
- Second argument.
- out : ndarray, optional
- Output argument. This must have the exact kind that would be returned
- if it was not used. In particular, it must have the right type, must be
- C-contiguous, and its dtype must be the dtype that would be returned
- for `dot(a,b)`. This is a performance feature. Therefore, if these
- conditions are not met, an exception is raised, instead of attempting
- to be flexible.
-
- Returns
- -------
- output : ndarray
- Returns the dot product of `a` and `b`. If `a` and `b` are both
- 1-D arrays then a scalar is returned; otherwise an array is
- returned. If `out` is given, then it is returned.
-
- Raises
- ------
- ValueError
- If the last dimension of `a` is not the same size as
- the second-to-last dimension of `b`.
-
- If scalar value is passed.
-
- See Also
- --------
- vdot : Complex-conjugating dot product.
- tensordot : Sum products over arbitrary axes.
- einsum : Einstein summation convention.
- dot : alternative matrix product with different broadcasting rules.
-
- Notes
- -----
- The matmul function implements the semantics of the `@` operator introduced
- in Python 3.5 following PEP465.
-
- Examples
- --------
- For 2-D arrays it is the matrix product:
-
- >>> a = [[1, 0], [0, 1]]
- >>> b = [[4, 1], [2, 2]]
- >>> np.matmul(a, b)
- array([[4, 1],
- [2, 2]])
-
- For 2-D mixed with 1-D, the result is the usual.
-
- >>> a = [[1, 0], [0, 1]]
- >>> b = [1, 2]
- >>> np.matmul(a, b)
- array([1, 2])
- >>> np.matmul(b, a)
- array([1, 2])
-
-
- Broadcasting is conventional for stacks of arrays
-
- >>> a = np.arange(2*2*4).reshape((2,2,4))
- >>> b = np.arange(2*2*4).reshape((2,4,2))
- >>> np.matmul(a,b).shape
- (2, 2, 2)
- >>> np.matmul(a,b)[0,1,1]
- 98
- >>> sum(a[0,1,:] * b[0,:,1])
- 98
-
- Vector, vector returns the scalar inner product, but neither argument
- is complex-conjugated:
-
- >>> np.matmul([2j, 3j], [2j, 3j])
- (-13+0j)
-
- Scalar multiplication raises an error.
-
- >>> np.matmul([1,2], 3)
- Traceback (most recent call last):
- ...
- ValueError: Scalar operands are not allowed, use '*' instead
-
- """)
-
add_newdoc('numpy.core.multiarray', 'c_einsum',
"""
c_einsum(subscripts, *operands, out=None, dtype=None, order='K',
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 9d4e72c0e..a092cb95e 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -127,7 +127,7 @@ class Ufunc(object):
type_descriptions : list of TypeDescription objects
"""
def __init__(self, nin, nout, identity, docstring, typereso,
- *type_descriptions):
+ *type_descriptions, **kwargs):
self.nin = nin
self.nout = nout
if identity is None:
@@ -136,10 +136,13 @@ class Ufunc(object):
self.docstring = docstring
self.typereso = typereso
self.type_descriptions = []
+ self.signature = kwargs.pop('signature', 'NULL')
for td in type_descriptions:
self.type_descriptions.extend(td)
for td in self.type_descriptions:
td.finish_signature(self.nin, self.nout)
+ if kwargs:
+ raise ValueError('unknown kwargs %r' % str(kwargs))
# String-handling utilities to avoid locale-dependence.
@@ -904,7 +907,14 @@ defdict = {
"PyUFunc_SimpleBinaryOperationTypeResolver",
TD(ints),
TD('O', f='npy_ObjectLCM'),
- )
+ ),
+'matmul' :
+ Ufunc(2, 1, None,
+ docstrings.get('numpy.core.umath.matmul'),
+ "PyUFunc_MatmulTypeResolver",
+ TD(notimes_or_obj),
+ signature='"(n?,k),(k,m?)->(n?,m?)"',
+ ),
}
if sys.version_info[0] >= 3:
@@ -1058,7 +1068,7 @@ def make_ufuncs(funcdict):
f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
{name}_functions, {name}_data, {name}_signatures, {nloops},
{nin}, {nout}, {identity}, "{name}",
- "{doc}", 0, NULL, identity
+ "{doc}", 0, {signature}, identity
);
if ({has_identity}) {{
Py_DECREF(identity);
@@ -1073,7 +1083,8 @@ def make_ufuncs(funcdict):
has_identity='0' if uf.identity is None_ else '1',
identity='PyUFunc_IdentityValue',
identity_expr=uf.identity,
- doc=docstring
+ doc=docstring,
+ signature=uf.signature,
)
# Only PyUFunc_None means don't reorder - we pass this using the old
@@ -1106,6 +1117,8 @@ def make_code(funcdict, filename):
#include "cpuid.h"
#include "ufunc_object.h"
#include "ufunc_type_resolution.h"
+ #include "loops.h"
+ #include "matmul.h"
%s
static int
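
Note: with the hunks above, matmul is registered through the ufunc generator as a
generalized ufunc whose core signature is (n?,k),(k,m?)->(n?,m?); the `?` marks
optional core dimensions, which is how the 1-D promotion rules are expressed. A
doctest-style sketch of the result, assuming a build that includes this patch:

    >>> import numpy as np
    >>> isinstance(np.matmul, np.ufunc)
    True
    >>> np.matmul.signature
    '(n?,k),(k,m?)->(n?,m?)'
    >>> np.matmul(np.ones((2, 3)), np.ones(3)).shape  # optional m? dropped
    (2,)
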
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 13231de29..d79702a18 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -39,7 +39,8 @@ subst = {
def add_newdoc(place, name, doc):
doc = textwrap.dedent(doc).strip()
- if name[0] != '_':
+ if name[0] != '_' and name != 'matmul':
+ # matmul is special, it does not use the OUT_SCALAR replacement strings
if '\nx :' in doc:
assert '$OUT_SCALAR_1' in doc, "in {}".format(name)
elif '\nx2 :' in doc or '\nx1, x2 :' in doc:
@@ -2541,6 +2542,128 @@ add_newdoc('numpy.core.umath', 'fmin',
""")
+add_newdoc('numpy.core.umath', 'matmul',
+ """
+ Matrix product of two arrays.
+
+ Parameters
+ ----------
+ x1, x2 : array_like
+ Input arrays, scalars not allowed.
+ out : ndarray, optional
+ A location into which the result is stored. If provided, it must have
+ a shape that matches the signature `(n,k),(k,m)->(n,m)`. If not
+ provided or `None`, a freshly-allocated array is returned.
+ **kwargs
+ For other keyword-only arguments, see the
+ :ref:`ufunc docs <ufuncs.kwargs>`.
+
+ .. versionchanged:: 1.16
+    Now handles ufunc kwargs
+
+ Returns
+ -------
+ y : ndarray
+ The matrix product of the inputs.
+ This is a scalar only when both x1, x2 are 1-d vectors.
+
+ Raises
+ ------
+ ValueError
+ If the last dimension of `x1` is not the same size as
+ the second-to-last dimension of `x2`.
+
+ If a scalar value is passed in.
+
+ See Also
+ --------
+ vdot : Complex-conjugating dot product.
+ tensordot : Sum products over arbitrary axes.
+ einsum : Einstein summation convention.
+ dot : Alternative matrix product with different broadcasting rules.
+
+ Notes
+ -----
+
+ The behavior depends on the arguments in the following way.
+
+ - If both arguments are 2-D they are multiplied like conventional
+ matrices.
+ - If either argument is N-D, N > 2, it is treated as a stack of
+ matrices residing in the last two indexes and broadcast accordingly.
+ - If the first argument is 1-D, it is promoted to a matrix by
+ prepending a 1 to its dimensions. After matrix multiplication
+ the prepended 1 is removed.
+ - If the second argument is 1-D, it is promoted to a matrix by
+ appending a 1 to its dimensions. After matrix multiplication
+ the appended 1 is removed.
+
+ ``matmul`` differs from ``dot`` in two important ways.
+
+ - Multiplication by scalars is not allowed, use ``*`` instead.
+ - Stacks of matrices are broadcast together as if the matrices
+ were elements, respecting the signature `(n,k),(k,m)->(n,m)`:
+
+ >>> a = np.full([9, 5, 7, 4], True, dtype=bool)
+ >>> c = np.full([9, 5, 4, 3], True, dtype=bool)
+ >>> np.dot(a, c).shape
+ (9, 5, 7, 9, 5, 3)
+ >>> np.matmul(a, c).shape # n is 7, k is 4, m is 3
+ (9, 5, 7, 3)
+
+ The ``matmul`` function implements the semantics of the ``@`` operator
+ introduced in Python 3.5 following :pep:`465`.
+
+ Examples
+ --------
+ For 2-D arrays it is the matrix product:
+
+ >>> a = np.array([[1, 0],
+ ... [0, 1]])
+ >>> b = np.array([[4, 1],
+ ... [2, 2]])
+ >>> np.matmul(a, b)
+ array([[4, 1],
+ [2, 2]])
+
+ For a 2-D array mixed with a 1-D array, the result is the usual
+ matrix-vector (or vector-matrix) product.
+
+ >>> a = np.array([[1, 0],
+ ... [0, 1]])
+ >>> b = np.array([1, 2])
+ >>> np.matmul(a, b)
+ array([1, 2])
+ >>> np.matmul(b, a)
+ array([1, 2])
+
+
+ Broadcasting is conventional for stacks of arrays:
+
+ >>> a = np.arange(2*2*4).reshape((2,2,4))
+ >>> b = np.arange(2*2*4).reshape((2,4,2))
+ >>> np.matmul(a,b).shape
+ (2, 2, 2)
+ >>> np.matmul(a,b)[0,1,1]
+ 98
+ >>> sum(a[0,1,:] * b[0,:,1])
+ 98
+
+ Vector, vector returns the scalar inner product, but neither argument
+ is complex-conjugated:
+
+ >>> np.matmul([2j, 3j], [2j, 3j])
+ (-13+0j)
+
+ Scalar multiplication raises an error.
+
+ >>> np.matmul([1,2], 3)
+ Traceback (most recent call last):
+ ...
+ ValueError: Scalar operands are not allowed, use '*' instead
+
+ .. versionadded:: 1.10.0
+ """)
+
add_newdoc('numpy.core.umath', 'modf',
"""
Return the fractional and integral parts of an array, element-wise.
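
Note: the ``versionchanged`` entry above refers to matmul now accepting the
standard (g)ufunc keyword arguments. For example, the ``axes`` argument
(exercised by test_matmul_axes later in this patch) selects which axes hold
the core dimensions; a sketch:

    >>> a = np.arange(3*4*5).reshape(3, 4, 5)
    >>> np.matmul(a, a, axes=[(-2, -1), (-1, -2), (1, 2)]).shape
    (3, 4, 4)
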
diff --git a/numpy/core/multiarray.py b/numpy/core/multiarray.py
index 78963b0aa..d4d5922cd 100644
--- a/numpy/core/multiarray.py
+++ b/numpy/core/multiarray.py
@@ -51,7 +51,6 @@ fromiter.__module__ = 'numpy'
frompyfunc.__module__ = 'numpy'
fromstring.__module__ = 'numpy'
geterrobj.__module__ = 'numpy'
-matmul.__module__ = 'numpy'
may_share_memory.__module__ = 'numpy'
nested_iters.__module__ = 'numpy'
promote_types.__module__ = 'numpy'
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 23a9e268b..467b590ac 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -732,6 +732,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'common', 'cblasfuncs.h'),
join('src', 'common', 'lowlevel_strided_loops.h'),
join('src', 'common', 'mem_overlap.h'),
+ join('src', 'common', 'npy_cblas.h'),
join('src', 'common', 'npy_config.h'),
join('src', 'common', 'npy_ctypes.h'),
join('src', 'common', 'npy_extint128.h'),
@@ -892,6 +893,8 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'simd.inc.src'),
join('src', 'umath', 'loops.h.src'),
join('src', 'umath', 'loops.c.src'),
+ join('src', 'umath', 'matmul.h.src'),
+ join('src', 'umath', 'matmul.c.src'),
join('src', 'umath', 'ufunc_object.c'),
join('src', 'umath', 'extobj.c'),
join('src', 'umath', 'cpuid.c'),
diff --git a/numpy/core/src/common/cblasfuncs.c b/numpy/core/src/common/cblasfuncs.c
index 6460c5db1..514297940 100644
--- a/numpy/core/src/common/cblasfuncs.c
+++ b/numpy/core/src/common/cblasfuncs.c
@@ -182,7 +182,7 @@ _select_matrix_shape(PyArrayObject *array)
* This also makes sure that the data segment is aligned with
* an itemsize address as well by returning one if not true.
*/
-static int
+NPY_NO_EXPORT int
_bad_strides(PyArrayObject *ap)
{
int itemsize = PyArray_ITEMSIZE(ap);
diff --git a/numpy/core/src/common/cblasfuncs.h b/numpy/core/src/common/cblasfuncs.h
index 66ce4ca5b..78eff25a0 100644
--- a/numpy/core/src/common/cblasfuncs.h
+++ b/numpy/core/src/common/cblasfuncs.h
@@ -3,5 +3,7 @@
NPY_NO_EXPORT PyObject *
cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT int
+_bad_strides(PyArrayObject *ap);
#endif
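
Note: exposing _bad_strides here lets the new matmul type resolver detect
operands whose strides BLAS cannot consume (negative, or not a multiple of
the itemsize) and copy them first. An illustrative view that would take the
copy path, assuming this heuristic:

    >>> m = np.arange(9.).reshape(3, 3)[::-1]  # reversed rows
    >>> m.strides                              # negative stride on axis 0
    (-24, 8)
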
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 909a24359..5ccb7f6d6 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -72,7 +72,6 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
*****************************************************************************
*/
#include "funcs.inc"
-#include "loops.h"
#include "umathmodule.h"
NPY_NO_EXPORT int initscalarmath(PyObject *);
@@ -2318,157 +2317,6 @@ fail:
return NULL;
}
-
-
-/*
- * matmul
- *
- * Implements the protocol used by the '@' operator defined in PEP 364.
- * Not in the NUMPY API at this time, maybe later.
- *
- *
- * in1: Left hand side operand
- * in2: Right hand side operand
- * out: Either NULL, or an array into which the output should be placed.
- *
- * Returns NULL on error.
- */
-static PyObject *
-array_matmul(PyObject *NPY_UNUSED(m), PyObject *args, PyObject* kwds)
-{
- PyObject *in1, *in2, *out = NULL;
- char* kwlist[] = {"a", "b", "out", NULL };
- PyArrayObject *ap1, *ap2, *ret = NULL;
- NPY_ORDER order = NPY_KEEPORDER;
- NPY_CASTING casting = NPY_SAFE_CASTING;
- PyArray_Descr *dtype;
- int nd1, nd2, typenum;
- char *subscripts;
- PyArrayObject *ops[2];
-
- if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO|O:matmul", kwlist,
- &in1, &in2, &out)) {
- return NULL;
- }
-
- if (out != NULL) {
- if (out == Py_None) {
- out = NULL;
- }
- else if (!PyArray_Check(out)) {
- PyErr_SetString(PyExc_TypeError, "'out' must be an array");
- return NULL;
- }
- }
-
- dtype = PyArray_DescrFromObject(in1, NULL);
- dtype = PyArray_DescrFromObject(in2, dtype);
- if (dtype == NULL) {
- if (!PyErr_Occurred()) {
- PyErr_SetString(PyExc_ValueError,
- "Cannot find a common data type.");
- }
- return NULL;
- }
- typenum = dtype->type_num;
-
- if (typenum == NPY_OBJECT) {
- /* matmul is not currently implemented for object arrays */
- PyErr_SetString(PyExc_TypeError,
- "Object arrays are not currently supported");
- Py_DECREF(dtype);
- return NULL;
- }
-
- ap1 = (PyArrayObject *)PyArray_FromAny(in1, dtype, 0, 0,
- NPY_ARRAY_ALIGNED, NULL);
- if (ap1 == NULL) {
- return NULL;
- }
-
- Py_INCREF(dtype);
- ap2 = (PyArrayObject *)PyArray_FromAny(in2, dtype, 0, 0,
- NPY_ARRAY_ALIGNED, NULL);
- if (ap2 == NULL) {
- Py_DECREF(ap1);
- return NULL;
- }
-
- if (PyArray_NDIM(ap1) == 0 || PyArray_NDIM(ap2) == 0) {
- /* Scalars are rejected */
- PyErr_SetString(PyExc_ValueError,
- "Scalar operands are not allowed, use '*' instead");
- return NULL;
- }
-
- nd1 = PyArray_NDIM(ap1);
- nd2 = PyArray_NDIM(ap2);
-
-#if defined(HAVE_CBLAS)
- if (nd1 <= 2 && nd2 <= 2 &&
- (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
- NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
- return cblas_matrixproduct(typenum, ap1, ap2, (PyArrayObject *)out);
- }
-#endif
-
- /*
- * Use einsum for the stacked cases. This is a quick implementation
- * to avoid setting up the proper iterators. Einsum broadcasts, so
- * we need to check dimensions before the call.
- */
- if (nd1 == 1 && nd2 == 1) {
- /* vector vector */
- if (PyArray_DIM(ap1, 0) != PyArray_DIM(ap2, 0)) {
- dot_alignment_error(ap1, 0, ap2, 0);
- goto fail;
- }
- subscripts = "i, i";
- }
- else if (nd1 == 1) {
- /* vector matrix */
- if (PyArray_DIM(ap1, 0) != PyArray_DIM(ap2, nd2 - 2)) {
- dot_alignment_error(ap1, 0, ap2, nd2 - 2);
- goto fail;
- }
- subscripts = "i, ...ij";
- }
- else if (nd2 == 1) {
- /* matrix vector */
- if (PyArray_DIM(ap1, nd1 - 1) != PyArray_DIM(ap2, 0)) {
- dot_alignment_error(ap1, nd1 - 1, ap2, 0);
- goto fail;
- }
- subscripts = "...i, i";
- }
- else {
- /* matrix * matrix */
- if (PyArray_DIM(ap1, nd1 - 1) != PyArray_DIM(ap2, nd2 - 2)) {
- dot_alignment_error(ap1, nd1 - 1, ap2, nd2 - 2);
- goto fail;
- }
- subscripts = "...ij, ...jk";
- }
- ops[0] = ap1;
- ops[1] = ap2;
- ret = PyArray_EinsteinSum(subscripts, 2, ops, NULL, order, casting,
- (PyArrayObject *)out);
- Py_DECREF(ap1);
- Py_DECREF(ap2);
-
- /* If no output was supplied, possibly convert to a scalar */
- if (ret != NULL && out == NULL) {
- return PyArray_Return((PyArrayObject *)ret);
- }
- return (PyObject *)ret;
-
-fail:
- Py_XDECREF(ap1);
- Py_XDECREF(ap2);
- return NULL;
-}
-
-
static int
einsum_sub_op_from_str(PyObject *args, PyObject **str_obj, char **subscripts,
PyArrayObject **op)
@@ -4285,9 +4133,6 @@ static struct PyMethodDef array_module_methods[] = {
{"vdot",
(PyCFunction)array_vdot,
METH_VARARGS | METH_KEYWORDS, NULL},
- {"matmul",
- (PyCFunction)array_matmul,
- METH_VARARGS | METH_KEYWORDS, NULL},
{"c_einsum",
(PyCFunction)array_einsum,
METH_VARARGS|METH_KEYWORDS, NULL},
diff --git a/numpy/core/src/multiarray/number.c b/numpy/core/src/multiarray/number.c
index 5ee536d4f..d153a8a64 100644
--- a/numpy/core/src/multiarray/number.c
+++ b/numpy/core/src/multiarray/number.c
@@ -112,6 +112,7 @@ _PyArray_SetNumericOps(PyObject *dict)
SET(minimum);
SET(rint);
SET(conjugate);
+ SET(matmul);
return 0;
}
@@ -177,6 +178,7 @@ _PyArray_GetNumericOps(void)
GET(minimum);
GET(rint);
GET(conjugate);
+ GET(matmul);
return dict;
fail:
@@ -382,14 +384,8 @@ array_divmod(PyArrayObject *m1, PyObject *m2)
static PyObject *
array_matrix_multiply(PyArrayObject *m1, PyObject *m2)
{
- static PyObject *matmul = NULL;
-
- npy_cache_import("numpy.core.multiarray", "matmul", &matmul);
- if (matmul == NULL) {
- return NULL;
- }
BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_matrix_multiply, array_matrix_multiply);
- return PyArray_GenericBinaryFunction(m1, m2, matmul);
+ return PyArray_GenericBinaryFunction(m1, m2, n_ops.matmul);
}
static PyObject *
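
Note: after this change the `@` operator on ndarrays dispatches straight to
the cached ufunc in the numeric-ops table instead of importing
numpy.core.multiarray.matmul at call time. From the caller's side behavior
is unchanged:

    >>> a = np.arange(4.).reshape(2, 2)
    >>> b = np.eye(2)
    >>> np.array_equal(a @ b, np.matmul(a, b))
    True
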
diff --git a/numpy/core/src/multiarray/number.h b/numpy/core/src/multiarray/number.h
index fbdfe6f94..33a7cf872 100644
--- a/numpy/core/src/multiarray/number.h
+++ b/numpy/core/src/multiarray/number.h
@@ -39,6 +39,7 @@ typedef struct {
PyObject *minimum;
PyObject *rint;
PyObject *conjugate;
+ PyObject *matmul;
} NumericOps;
extern NPY_NO_EXPORT NumericOps n_ops;
diff --git a/numpy/core/src/multiarray/scalartypes.c.src b/numpy/core/src/multiarray/scalartypes.c.src
index 0f201b966..2f71c8ae9 100644
--- a/numpy/core/src/multiarray/scalartypes.c.src
+++ b/numpy/core/src/multiarray/scalartypes.c.src
@@ -1104,8 +1104,7 @@ static PyNumberMethods gentype_as_number = {
(binaryfunc)gentype_add, /*nb_add*/
(binaryfunc)gentype_subtract, /*nb_subtract*/
(binaryfunc)gentype_multiply, /*nb_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(binaryfunc)gentype_divide, /*nb_divide*/
#endif
(binaryfunc)gentype_remainder, /*nb_remainder*/
@@ -1121,8 +1120,7 @@ static PyNumberMethods gentype_as_number = {
(binaryfunc)gentype_and, /*nb_and*/
(binaryfunc)gentype_xor, /*nb_xor*/
(binaryfunc)gentype_or, /*nb_or*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*nb_coerce*/
#endif
(unaryfunc)gentype_int, /*nb_int*/
@@ -1132,16 +1130,14 @@ static PyNumberMethods gentype_as_number = {
(unaryfunc)gentype_long, /*nb_long*/
#endif
(unaryfunc)gentype_float, /*nb_float*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(unaryfunc)gentype_oct, /*nb_oct*/
(unaryfunc)gentype_hex, /*nb_hex*/
#endif
0, /*inplace_add*/
0, /*inplace_subtract*/
0, /*inplace_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*inplace_divide*/
#endif
0, /*inplace_remainder*/
@@ -1156,6 +1152,10 @@ static PyNumberMethods gentype_as_number = {
0, /*nb_inplace_floor_divide*/
0, /*nb_inplace_true_divide*/
(unaryfunc)NULL, /*nb_index*/
+#if PY_VERSION_HEX >= 0x03050000
+ 0, /*nb_matrix_multiply*/
+ 0, /*nb_inplace_matrix_multiply*/
+#endif
};
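
Note: the new slots are filled with 0, so generic numpy scalars still do not
implement `@` themselves; Python falls back and raises TypeError, consistent
with test_matmul_raises below. A sketch (the exact message may differ):

    >>> np.float64(2.0) @ np.float64(3.0)
    Traceback (most recent call last):
    ...
    TypeError: unsupported operand type(s) for @: ...
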
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
new file mode 100644
index 000000000..5f5bd3506
--- /dev/null
+++ b/numpy/core/src/umath/matmul.c.src
@@ -0,0 +1,403 @@
+/* -*- c -*- */
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "Python.h"
+
+#include "npy_config.h"
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_cblas.h"
+#include "arraytypes.h" /* For TYPE_dot functions */
+#include <assert.h>
+
+/*
+ *****************************************************************************
+ ** BASICS **
+ *****************************************************************************
+ */
+
+#if defined(HAVE_CBLAS)
+static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
+static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
+
+/**begin repeat
+ *
+ * #name = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
+ * #ctype = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #type = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #prefix = s, d, c, z#
+ * #step1 = 1.F, 1., &oneF, &oneD#
+ * #step0 = 0.F, 0., &zeroF, &zeroD#
+ */
+NPY_NO_EXPORT void
+@name@_gemv(void *ip1, npy_intp is1_m, void *ip2, npy_intp is2_n, void *op,
+ npy_intp m, npy_intp n)
+{
+ /*
+ * Vector matrix multiplication -- Level 2 BLAS
+ * arguments
+ * ip1: contiguous data, m*n shape
+ * ip2: data in c order, n*1 shape
+ * op: contiguous data in c order, m shape
+ */
+ enum CBLAS_ORDER order;
+ int lda;
+
+ if (is1_m == sizeof(@type@)) {
+ order = CblasColMajor;
+ lda = n;
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is1_m == sizeof(@type@) * m);
+ order = CblasRowMajor;
+ lda = m;
+ }
+ cblas_@prefix@gemv(order, CblasTrans, n, m, @step1@, ip1, lda, ip2,
+ is2_n / sizeof(@type@), @step0@, op, 1);
+}
+
+NPY_NO_EXPORT void
+@name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
+ void *ip2, npy_intp is2_n, npy_intp is2_p,
+ void *op, npy_intp m, npy_intp n, npy_intp p)
+{
+ /*
+ * matrix matrix multiplication -- Level 3 BLAS
+ */
+ enum CBLAS_ORDER order = CblasRowMajor;
+ enum CBLAS_TRANSPOSE trans1, trans2;
+ int M, N, P, lda, ldb;
+ M = m;
+ N = n;
+ P = p;
+
+ if (is1_m == sizeof(@type@)) {
+ trans1 = CblasTrans;
+ lda = N > 1 ? is1_n / sizeof(@type@) : 1;
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is1_n == sizeof(@type@));
+ trans1 = CblasNoTrans;
+ lda = N > 1 ? is1_m / sizeof(@type@) : 1;
+ }
+ if (is2_n == sizeof(@type@)) {
+ trans2 = CblasTrans;
+ ldb = N > 1 ? is2_p / sizeof(@type@) : 1;
+ }
+ else {
+ /* If not ColMajor, caller should have ensured we are RowMajor */
+ /* will not assert in release mode */
+ assert(is2_p == sizeof(@type@));
+ trans2 = CblasNoTrans;
+ ldb = P > 1 ? P : 1;
+ }
+ /*
+ * Use syrk if we have a case of a matrix times its transpose.
+ * Otherwise, use gemm for all other cases.
+ */
+ if (
+ (ip1 == ip2) &&
+ (m == p) &&
+ (is1_m == is2_p) &&
+ (is1_n == is2_n) &&
+ (trans1 != trans2)
+ ) {
+ npy_intp i,j;
+ if (trans1 == CblasNoTrans) {
+ cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
+ ip1, lda, @step0@, op, P);
+ }
+ else {
+ cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
+ ip1, ldb, @step0@, op, P);
+ }
+ /* Copy the triangle */
+ for (i = 0; i < P; i++) {
+ for (j = i + 1; j < P; j++) {
+ ((@type@*)op)[j * P + i] = ((@type@*)op)[i * P + j];
+ }
+ }
+
+ }
+ else {
+ cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda,
+ ip2, ldb, @step0@, op, P);
+ }
+}
+
+/**end repeat**/
+#endif
+
+/*
+ * matmul loops
+ * signature is (m?,n),(n,p?)->(m?,p?)
+ */
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, HALF#
+ * #typ = npy_float,npy_double,npy_half#
+ * #SPECL = 0,0,1#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
+ npy_intp dm, npy_intp dn, npy_intp dp,
+ npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
+ npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
+ npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
+ npy_intp os_p)
+{
+ npy_intp m, n, p;
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+ /*
+ * Use a double as an intermediate sum, which is natural for
+ * npy_double, slightly increases the accuracy of npy_float,
+ * and is perhaps overkill for npy_half.
+ */
+ double sum = 0;
+ for (n = 0; n < dn; n++) {
+#if @SPECL@ == 1
+ @typ@ val1 = (*(@typ@ *)ip1);
+ @typ@ val2 = (*(@typ@ *)ip2);
+ sum += npy_half_to_float(val1) * npy_half_to_float(val2);
+#else
+ sum += ((double)*(@typ@ *)ip1) * ((double)*(@typ@ *)ip2);
+#endif
+ ip2 += is2_n;
+ ip1 += is1_n;
+ }
+#if @SPECL@ == 1
+ *(@typ@ *)op = npy_float_to_half((float)sum);
+#else
+ /* in the case of double -> float, may produce INF */
+ *(@typ@ *)op = (@typ@)sum;
+#endif
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #TYPE = LONGDOUBLE,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL#
+ * #typ = npy_longdouble,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_bool#
+ * #IS_COMPLEX = 0, 1, 1, 1, 0*11#
+ */
+
+NPY_NO_EXPORT void
+@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
+ npy_intp dm, npy_intp dn, npy_intp dp,
+ npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
+ npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
+ npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
+ npy_intp os_p)
+{
+ npy_intp m, n, p;
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+#if @IS_COMPLEX@ == 1
+ (*(@typ@ *)op).real = 0;
+ (*(@typ@ *)op).imag = 0;
+#else
+ *(@typ@ *)op = 0;
+#endif
+ for (n = 0; n < dn; n++) {
+ @typ@ val1 = (*(@typ@ *)ip1);
+ @typ@ val2 = (*(@typ@ *)ip2);
+#if @IS_COMPLEX@ == 1
+ (*(@typ@ *)op).real += (val1.real * val2.real) -
+ (val1.imag * val2.imag);
+ (*(@typ@ *)op).imag += (val1.real * val2.imag) +
+ (val1.imag * val2.real);
+#else
+ *(@typ@ *)op += val1 * val2;
+#endif
+ ip2 += is2_n;
+ ip1 += is1_n;
+ }
+ ip1 -= ib1_n;
+ ip2 -= ib2_n;
+ op += os_p;
+ ip2 += is2_p;
+ }
+ op -= ob_p;
+ ip2 -= ib2_p;
+ ip1 += is1_m;
+ op += os_m;
+ }
+}
+
+/**end repeat**/
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL#
+ * #typ = npy_float,npy_double,npy_longdouble, npy_half,
+ * npy_cfloat, npy_cdouble, npy_clongdouble,
+ * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
+ * npy_byte, npy_short, npy_int, npy_long, npy_longlong,
+ * npy_bool#
+ * #SPECL = 0, 0, 0, 2, 1, 1, 1, 0*11#
+ * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*12#
+ * #chr = s, d, 0, 0, c, z, 0*12#
+ * #blas_typ = npy_float, npy_double, void*16#
+ */
+
+
+NPY_NO_EXPORT void
+@TYPE@_matmul(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func))
+{
+ npy_intp dOuter = *dimensions++;
+ npy_intp iOuter;
+ npy_intp s0 = *steps++;
+ npy_intp s1 = *steps++;
+ npy_intp s2 = *steps++;
+ npy_intp dm = dimensions[0];
+ npy_intp dn = dimensions[1];
+ npy_intp dp = dimensions[2];
+ npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+ os_m=steps[4], os_p=steps[5];
+ npy_intp ib1_n, ib2_n, ib2_p, ob_p;
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+ npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+ npy_bool scalar_out = (dm == 1 && dp == 1);
+ npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+ npy_bool too_big_for_blas = (dm > NPY_MAX_INT || dn > NPY_MAX_INT ||
+ dp > NPY_MAX_INT);
+ npy_bool input_contiguous = ((is1_m == sizeof(@typ@) ||
+ is1_n == sizeof(@typ@)) &&
+ (is2_n == sizeof(@typ@) ||
+ is2_p == sizeof(@typ@)));
+ npy_bool vector_matrix = ((dm == 1) &&
+ (is2_n == sizeof(@typ@) || (is2_n == sizeof(@typ@) * dp)));
+ npy_bool matrix_vector = ((dp == 1) &&
+ (is1_n == sizeof(@typ@) || (is1_n == sizeof(@typ@) * dm)));
+#endif
+
+ ib1_n = is1_n*dn;
+ ib2_n = is2_n*dn;
+ ib2_p = is2_p*dp;
+ ob_p = os_p *dp;
+
+ if (dn == 0) {
+ /* The summed-over dimension is 0; nothing to add, so zero the output */
+ for (iOuter = 0; iOuter < dOuter; iOuter++,
+ args[0] += s0, args[1] += s1, args[2] += s2) {
+ npy_intp m, p;
+ char *op=args[2];
+ for (m = 0; m < dm; m++) {
+ for (p = 0; p < dp; p++) {
+#if @SPECL@ == 1
+ (*(@typ@ *)op).real = 0;
+ (*(@typ@ *)op).imag = 0;
+#else
+ *(@typ@ *)op = 0;
+#endif
+ op += os_p;
+ }
+ op += os_m - ob_p;
+ }
+ }
+ return;
+ }
+ for (iOuter = 0; iOuter < dOuter; iOuter++,
+ args[0] += s0, args[1] += s1, args[2] += s2) {
+ char *ip1=args[0], *ip2=args[1], *op=args[2];
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+ /*
+ * TODO: refactor this out to an inner_loop_selector, in
+ * PyUFunc_MatmulLoopSelector. But that call does not have access to
+ * n, m, p and strides.
+ */
+ if (too_big_for_blas) {
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ }
+ else if (special_case) {
+ /* Special case variants that have a 1 in the core dimensions */
+ if (scalar_out) {
+ /* row @ column, 1,1 output */
+ @TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+ } else if (scalar_vec) {
+ /*
+ * 0d @ vector or vector @ 0d
+ * could use cblas_Xaxpy, but that requires zeroing the output
+ * and would not be faster (XXX prove it)
+ */
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ } else if (vector_matrix) {
+ /* vector @ matrix, switch ip1, ip2, p and m */
+ @TYPE@_gemv((void*)ip2, is2_n, (void*)ip1, is1_n,
+ (void*)op, dp, dn);
+ } else if (matrix_vector) {
+ /* matrix @ vector */
+ @TYPE@_gemv((void*)ip1, is1_n, (void*)ip2, is2_n,
+ (void*)op, dm, dn);
+ } else {
+ /* column @ row, 2d output, no blas needed or non-contiguous input */
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ }
+ } else {
+ /* matrix @ matrix */
+ if (input_contiguous) {
+ /* can only use blas if input is contiguous */
+ @TYPE@_matmul_matrixmatrix((void*)ip1, is1_m, is1_n,
+ (void*)ip2, is2_n, is2_p,
+ (void*)op, dm, dn, dp);
+ } else {
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ }
+ }
+#else
+ @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
+ ib1_n, ib2_n, ib2_p, ob_p,
+ is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+
+#endif
+ }
+}
+
+/**end repeat**/
+
+
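
Note: for readability, here is a rough Python model of the
@TYPE@_matmul_inner_noblas kernel above, with the byte-stride bookkeeping
replaced by plain indexing; it is illustrative only, not the C code:

    def matmul_inner_noblas(a, b, out, dm, dn, dp):
        # out[m, p] = sum over n of a[m, n] * b[n, p]; the C version walks
        # raw pointers by (is1_m, is1_n, is2_n, is2_p, os_m, os_p) and
        # accumulates in double for the float/half loops.
        for m in range(dm):
            for p in range(dp):
                acc = 0
                for n in range(dn):
                    acc += a[m, n] * b[n, p]
                out[m, p] = acc

The outer @TYPE@_matmul entry then picks a path per invocation: dot for a
row @ column scalar result, gemv for matrix-vector shapes, gemm (or syrk for
a matrix times its own transpose) for matrix-matrix, falling back to this
loop for non-contiguous or oversized operands and for dtypes without BLAS
support.
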
diff --git a/numpy/core/src/umath/matmul.h.src b/numpy/core/src/umath/matmul.h.src
new file mode 100644
index 000000000..16be7675b
--- /dev/null
+++ b/numpy/core/src/umath/matmul.h.src
@@ -0,0 +1,12 @@
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE, HALF,
+ * CFLOAT, CDOUBLE, CLONGDOUBLE,
+ * UBYTE, USHORT, UINT, ULONG, ULONGLONG,
+ * BYTE, SHORT, INT, LONG, LONGLONG,
+ * BOOL#
+ **/
+NPY_NO_EXPORT void
+@TYPE@_matmul(char **args, npy_intp *dimensions, npy_intp *steps, void *NPY_UNUSED(func));
+/**end repeat**/
+
+
diff --git a/numpy/core/src/umath/scalarmath.c.src b/numpy/core/src/umath/scalarmath.c.src
index e98d9f865..a7987acda 100644
--- a/numpy/core/src/umath/scalarmath.c.src
+++ b/numpy/core/src/umath/scalarmath.c.src
@@ -1564,7 +1564,6 @@ static PyObject*
}
/**end repeat**/
-
/**begin repeat
* #name = byte, ubyte, short, ushort, int, uint,
* long, ulong, longlong, ulonglong,
@@ -1575,8 +1574,7 @@ static PyNumberMethods @name@_as_number = {
(binaryfunc)@name@_add, /*nb_add*/
(binaryfunc)@name@_subtract, /*nb_subtract*/
(binaryfunc)@name@_multiply, /*nb_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(binaryfunc)@name@_divide, /*nb_divide*/
#endif
(binaryfunc)@name@_remainder, /*nb_remainder*/
@@ -1596,8 +1594,7 @@ static PyNumberMethods @name@_as_number = {
(binaryfunc)@name@_and, /*nb_and*/
(binaryfunc)@name@_xor, /*nb_xor*/
(binaryfunc)@name@_or, /*nb_or*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*nb_coerce*/
#endif
(unaryfunc)@name@_int, /*nb_int*/
@@ -1607,16 +1604,14 @@ static PyNumberMethods @name@_as_number = {
(unaryfunc)@name@_long, /*nb_long*/
#endif
(unaryfunc)@name@_float, /*nb_float*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
(unaryfunc)@name@_oct, /*nb_oct*/
(unaryfunc)@name@_hex, /*nb_hex*/
#endif
0, /*inplace_add*/
0, /*inplace_subtract*/
0, /*inplace_multiply*/
-#if defined(NPY_PY3K)
-#else
+#if !defined(NPY_PY3K)
0, /*inplace_divide*/
#endif
0, /*inplace_remainder*/
@@ -1631,6 +1626,10 @@ static PyNumberMethods @name@_as_number = {
0, /*nb_inplace_floor_divide*/
0, /*nb_inplace_true_divide*/
(unaryfunc)NULL, /*nb_index*/
+#if PY_VERSION_HEX >= 0x03050000
+ 0, /*nb_matrix_multiply*/
+ 0, /*nb_inplace_matrix_multiply*/
+#endif
};
/**end repeat**/
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 1fe8745a0..66f512f7b 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2845,13 +2845,15 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
}
/* Fill in any allocated outputs */
- for (i = nin; i < nop; ++i) {
- if (op[i] == NULL) {
- op[i] = NpyIter_GetOperandArray(iter)[i];
- Py_INCREF(op[i]);
+ {
+ PyArrayObject **operands = NpyIter_GetOperandArray(iter);
+ for (i = 0; i < nop; ++i) {
+ if (op[i] == NULL) {
+ op[i] = operands[i];
+ Py_INCREF(op[i]);
+ }
}
}
-
/*
* Set up the inner strides array. Because we're not doing
* buffering, the strides are fixed throughout the looping.
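
Note: grabbing references to all iterator operands (not just allocated
outputs) is what lets a writeback temporary stand in for an output that
overlaps an input. Combined with the type-resolver changes below, this makes
overlapping calls like the one in test_matmul_out behave:

    >>> a = np.arange(18.).reshape(2, 3, 3)
    >>> b = np.matmul(a, a)
    >>> c = np.matmul(a, a, out=a)  # out overlaps an input
    >>> c is a and np.array_equal(c, b)
    True
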
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index 6b042d837..efd923972 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -22,6 +22,11 @@
#include "ufunc_object.h"
#include "common.h"
+#include "mem_overlap.h"
+#if defined(HAVE_CBLAS)
+#include "cblasfuncs.h"
+#endif
+
static const char *
npy_casting_to_string(NPY_CASTING casting)
{
@@ -1299,6 +1304,88 @@ PyUFunc_MixedDivisionTypeResolver(PyUFuncObject *ufunc,
type_tup, out_dtypes);
}
+/*
+ * XXX This is too restrictive, we should only check the inner axis involved
+ * gh-12365
+ */
+#define NOT_CONTIGUOUS(a) (PyArray_NDIM(a) >= 2 && !( \
+ PyArray_IS_C_CONTIGUOUS(a) || PyArray_IS_F_CONTIGUOUS(a)))
+
+/*
+ * This function applies special type resolution rules for the case
+ * where all the functions have the pattern XX->X. Like
+ * PyUFunc_SimpleBinaryOperationTypeResolver, it uses
+ * PyArray_ResultType instead of a linear search to get the best
+ * loop. It then adds memory-overlap and contiguity considerations
+ * for the operands, possibly creating writeback temporaries.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+NPY_NO_EXPORT int
+PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
+ NPY_CASTING casting,
+ PyArrayObject **operands,
+ PyObject *type_tup,
+ PyArray_Descr **out_dtypes)
+{
+
+ if (PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting, operands,
+ type_tup, out_dtypes) < 0) {
+ return -1;
+ }
+ if (PyArray_NDIM(operands[0]) >= 2 || PyArray_NDIM(operands[1]) >= 2) {
+#if defined(HAVE_CBLAS)
+ int typenum = out_dtypes[2]->type_num;
+ if ((NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
+ NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
+ /*
+ * We are going to use BLAS
+ * make sure 2d and more arrays are contiguous
+ */
+ if (_bad_strides(operands[0]) || NOT_CONTIGUOUS(operands[0])) {
+ PyObject *op = PyArray_NewCopy(operands[0], NPY_ANYORDER);
+
+ if (op == NULL) {
+ return -1;
+ }
+ Py_DECREF(operands[0]);
+ operands[0] = (PyArrayObject *)op;
+ }
+ if (_bad_strides(operands[1]) || NOT_CONTIGUOUS(operands[1])) {
+ PyObject *op = PyArray_NewCopy(operands[1], NPY_ANYORDER);
+
+ if (op == NULL) {
+ return -1;
+ }
+ Py_DECREF(operands[1]);
+ operands[1] = (PyArrayObject *)op;
+ }
+ }
+#endif
+ if (operands[2] != NULL) {
+ npy_intp last_stride = PyArray_STRIDE(operands[2],
+ PyArray_NDIM(operands[2]) - 1);
+ if (last_stride != PyArray_ITEMSIZE(operands[2])) {
+ PyErr_SetString(PyExc_ValueError,
+ "output array is not acceptable (must be C-contiguous)");
+ return -1;
+ }
+ /*
+ * Use all the flags in PyUFunc_GeneralizedFunction except
+ * NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE. Without that flag,
+ * nditer will properly check overlap between the output and
+ * any input, making a writeback temporary if needed.
+ */
+ ufunc->op_flags[2] = NPY_ITER_WRITEONLY |
+ NPY_ITER_UPDATEIFCOPY |
+ NPY_ITER_ALIGNED |
+ NPY_ITER_ALLOCATE |
+ NPY_ITER_NO_BROADCAST;
+ }
+ }
+ return 0;
+}
+
static int
find_userloop(PyUFuncObject *ufunc,
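
Note: the resolver's out-argument handling matches the behavior exercised in
the tests below: a cast to a wider dtype is allowed, an unsafe downcast
raises TypeError, and a non-C-contiguous output is rejected outright. A
sketch:

    >>> a = b = np.ones((2, 2))
    >>> out = np.zeros((2, 2), dtype=np.complex128)
    >>> np.matmul(a, b, out=out) is out
    True
    >>> np.matmul(a, b, out=np.zeros((2, 2), dtype=np.int32))
    Traceback (most recent call last):
    ...
    TypeError: Cannot cast ufunc matmul output ...
    >>> np.matmul(a, b, out=np.ones((2, 2, 2))[..., 0])
    Traceback (most recent call last):
    ...
    ValueError: output array is not acceptable (must be C-contiguous)
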
diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h
index bb4823d24..5306a5983 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.h
+++ b/numpy/core/src/umath/ufunc_type_resolution.h
@@ -145,5 +145,11 @@ PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc,
NpyAuxData **out_innerloopdata,
int *out_needs_api);
+NPY_NO_EXPORT int
+PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
+ NPY_CASTING casting,
+ PyArrayObject **operands,
+ PyObject *type_tup,
+ PyArray_Descr **out_dtypes);
#endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 51fe6e9ef..80c69d9b3 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -2595,7 +2595,8 @@ class TestMethods(object):
assert_equal(x1.flatten('F'), y1f)
assert_equal(x1.flatten('F'), x1.T.flatten())
- def test_dot(self):
+ @pytest.mark.parametrize('func', (np.dot, np.matmul))
+ def test_arr_mult(self, func):
a = np.array([[1, 0], [0, 1]])
b = np.array([[0, 1], [1, 0]])
c = np.array([[9, 1], [1, -9]])
@@ -2619,49 +2620,49 @@ class TestMethods(object):
# gemm vs syrk optimizations
for et in [np.float32, np.float64, np.complex64, np.complex128]:
eaf = a.astype(et)
- assert_equal(np.dot(eaf, eaf), eaf)
- assert_equal(np.dot(eaf.T, eaf), eaf)
- assert_equal(np.dot(eaf, eaf.T), eaf)
- assert_equal(np.dot(eaf.T, eaf.T), eaf)
- assert_equal(np.dot(eaf.T.copy(), eaf), eaf)
- assert_equal(np.dot(eaf, eaf.T.copy()), eaf)
- assert_equal(np.dot(eaf.T.copy(), eaf.T.copy()), eaf)
+ assert_equal(func(eaf, eaf), eaf)
+ assert_equal(func(eaf.T, eaf), eaf)
+ assert_equal(func(eaf, eaf.T), eaf)
+ assert_equal(func(eaf.T, eaf.T), eaf)
+ assert_equal(func(eaf.T.copy(), eaf), eaf)
+ assert_equal(func(eaf, eaf.T.copy()), eaf)
+ assert_equal(func(eaf.T.copy(), eaf.T.copy()), eaf)
# syrk validations
for et in [np.float32, np.float64, np.complex64, np.complex128]:
eaf = a.astype(et)
ebf = b.astype(et)
- assert_equal(np.dot(ebf, ebf), eaf)
- assert_equal(np.dot(ebf.T, ebf), eaf)
- assert_equal(np.dot(ebf, ebf.T), eaf)
- assert_equal(np.dot(ebf.T, ebf.T), eaf)
+ assert_equal(func(ebf, ebf), eaf)
+ assert_equal(func(ebf.T, ebf), eaf)
+ assert_equal(func(ebf, ebf.T), eaf)
+ assert_equal(func(ebf.T, ebf.T), eaf)
# syrk - different shape, stride, and view validations
for et in [np.float32, np.float64, np.complex64, np.complex128]:
edf = d.astype(et)
assert_equal(
- np.dot(edf[::-1, :], edf.T),
- np.dot(edf[::-1, :].copy(), edf.T.copy())
+ func(edf[::-1, :], edf.T),
+ func(edf[::-1, :].copy(), edf.T.copy())
)
assert_equal(
- np.dot(edf[:, ::-1], edf.T),
- np.dot(edf[:, ::-1].copy(), edf.T.copy())
+ func(edf[:, ::-1], edf.T),
+ func(edf[:, ::-1].copy(), edf.T.copy())
)
assert_equal(
- np.dot(edf, edf[::-1, :].T),
- np.dot(edf, edf[::-1, :].T.copy())
+ func(edf, edf[::-1, :].T),
+ func(edf, edf[::-1, :].T.copy())
)
assert_equal(
- np.dot(edf, edf[:, ::-1].T),
- np.dot(edf, edf[:, ::-1].T.copy())
+ func(edf, edf[:, ::-1].T),
+ func(edf, edf[:, ::-1].T.copy())
)
assert_equal(
- np.dot(edf[:edf.shape[0] // 2, :], edf[::2, :].T),
- np.dot(edf[:edf.shape[0] // 2, :].copy(), edf[::2, :].T.copy())
+ func(edf[:edf.shape[0] // 2, :], edf[::2, :].T),
+ func(edf[:edf.shape[0] // 2, :].copy(), edf[::2, :].T.copy())
)
assert_equal(
- np.dot(edf[::2, :], edf[:edf.shape[0] // 2, :].T),
- np.dot(edf[::2, :].copy(), edf[:edf.shape[0] // 2, :].T.copy())
+ func(edf[::2, :], edf[:edf.shape[0] // 2, :].T),
+ func(edf[::2, :].copy(), edf[:edf.shape[0] // 2, :].T.copy())
)
# syrk - different shape
@@ -2669,9 +2670,13 @@ class TestMethods(object):
edf = d.astype(et)
eddtf = ddt.astype(et)
edtdf = dtd.astype(et)
- assert_equal(np.dot(edf, edf.T), eddtf)
- assert_equal(np.dot(edf.T, edf), edtdf)
+ assert_equal(func(edf, edf.T), eddtf)
+ assert_equal(func(edf.T, edf), edtdf)
+ def test_dot(self):
+ a = np.array([[1, 0], [0, 1]])
+ b = np.array([[0, 1], [1, 0]])
+ c = np.array([[9, 1], [1, -9]])
# function versus methods
assert_equal(np.dot(a, b), a.dot(b))
assert_equal(np.dot(np.dot(a, b), c), a.dot(b).dot(c))
@@ -2738,6 +2743,18 @@ class TestMethods(object):
assert_raises(NotImplementedError, np.matmul, A(), A())
assert_raises(NotImplementedError, np.inner, A(), A())
+ def test_matmul_out(self):
+ # overlapping memory
+ a = np.arange(18).reshape(2, 3, 3)
+ b = np.matmul(a, a)
+ c = np.matmul(a, a, out=a)
+ assert_(c is a)
+ assert_equal(c, b)
+ a = np.arange(18).reshape(2, 3, 3)
+ c = np.matmul(a, a, out=a[::-1, ...])
+ assert_(c.base is a.base)
+ assert_equal(c, b)
+
def test_diagonal(self):
a = np.arange(12).reshape((3, 4))
assert_equal(a.diagonal(), [0, 5, 10])
@@ -3147,6 +3164,8 @@ class TestBinop(object):
# 'eq': (np.equal, False),
# 'ne': (np.not_equal, False),
}
+ if sys.version_info >= (3, 5):
+ ops['matmul'] = (np.matmul, False, float)
class Coerced(Exception):
pass
@@ -3189,7 +3208,7 @@ class TestBinop(object):
if issubclass(MyType, np.ndarray):
# Use this range to avoid special case weirdnesses around
# divide-by-0, pow(x, 2), overflow due to pow(big, big), etc.
- return np.arange(3, 5).view(MyType)
+ return np.arange(3, 7).reshape(2, 2).view(MyType)
else:
return MyType()
@@ -3198,7 +3217,7 @@ class TestBinop(object):
for op, (ufunc, has_inplace, dtype) in ops.items():
err_msg = ('op: %s, ufunc: %s, has_inplace: %s, dtype: %s'
% (op, ufunc, has_inplace, dtype))
- check_objs = [np.arange(3, 5, dtype=dtype)]
+ check_objs = [np.arange(3, 7, dtype=dtype).reshape(2, 2)]
if check_scalar:
check_objs.append(check_objs[0][0])
for arr in check_objs:
@@ -5697,13 +5716,36 @@ class MatmulCommon(object):
res = self.matmul(v, v)
assert_(type(res) is np.dtype(dt).type)
- def test_vector_vector_values(self):
- vec = np.array([1, 2])
- tgt = 5
+ def test_0d_vector_values(self):
+ vec1 = np.array([2])
+ vec2 = np.array([3, 4]).reshape(1, -1)
+ tgt = np.array([6, 8])
for dt in self.types[1:]:
- v1 = vec.astype(dt)
- res = self.matmul(v1, v1)
+ v1 = vec1.astype(dt)
+ v2 = vec2.astype(dt)
+ res = self.matmul(v1, v2)
assert_equal(res, tgt)
+ res = self.matmul(v2.T, v1)
+ assert_equal(res, tgt)
+
+ # boolean type
+ vec = np.array([True, True], dtype='?').reshape(1, -1)
+ res = self.matmul(vec[:, 0], vec)
+ assert_equal(res, True)
+
+ def test_vector_vector_values(self):
+ vec1 = np.array([1, 2])
+ vec2 = np.array([3, 4]).reshape(-1, 1)
+ tgt1 = np.array([11])
+ tgt2 = np.array([[3, 6], [4, 8]])
+ for dt in self.types[1:]:
+ v1 = vec1.astype(dt)
+ v2 = vec2.astype(dt)
+ res = self.matmul(v1, v2)
+ assert_equal(res, tgt1)
+ # no broadcast, we must make v1 into a 2d ndarray
+ res = self.matmul(v2, v1.reshape(1, -1))
+ assert_equal(res, tgt2)
# boolean type
vec = np.array([True, True], dtype='?')
@@ -5851,27 +5893,52 @@ class TestMatmul(MatmulCommon):
assert_array_equal(out, tgt, err_msg=msg)
# test out with not allowed type cast (safe casting)
- # einsum and cblas raise different error types, so
- # use Exception.
- msg = "out argument with illegal cast"
+ msg = "Cannot cast ufunc matmul output"
out = np.zeros((2, 2), dtype=np.int32)
- assert_raises(Exception, self.matmul, a, b, out=out)
+ assert_raises_regex(TypeError, msg, self.matmul, a, b, out=out)
- # skip following tests for now, cblas does not allow non-contiguous
+ # cblas does not allow non-contiguous
# outputs and consistency with dot would require same type,
# dimensions, subtype, and c_contiguous.
- # test out with allowed type cast
- # msg = "out argument with allowed cast"
- # out = np.zeros((2, 2), dtype=np.complex128)
- # self.matmul(a, b, out=out)
- # assert_array_equal(out, tgt, err_msg=msg)
+ # test out with type upcast to complex
+ out = np.zeros((2, 2), dtype=np.complex128)
+ c = self.matmul(a, b, out=out)
+ assert_(c is out)
+ with suppress_warnings() as sup:
+ sup.filter(np.ComplexWarning, '')
+ c = c.astype(tgt.dtype)
+ assert_array_equal(c, tgt)
# test out non-contiguous
- # msg = "out argument with non-contiguous layout"
- # c = np.zeros((2, 2, 2), dtype=float)
- # self.matmul(a, b, out=c[..., 0])
- # assert_array_equal(c, tgt, err_msg=msg)
+ out = np.ones((2, 2, 2), dtype=float)
+ assert_raises(ValueError, self.matmul, a, b, out=out[..., 0])
+
+ m1 = np.arange(15.).reshape(5, 3)
+ m2 = np.arange(21.).reshape(3, 7)
+ m3 = np.arange(30.).reshape(5, 6)[:, ::2] # non-contiguous
+ vc = np.arange(10.)
+ vr = np.arange(6.)
+ @pytest.mark.parametrize('args', (
+ # matrix-matrix
+ (m1, m2), (m2.T, m1.T), (m2.T.copy(), m1.T), (m2.T, m1.T.copy()),
+ # matrix-matrix-transpose, contiguous and non
+ (m1, m1.T), (m1.T, m1), (m1, m3.T), (m3, m1.T),
+ (m3, m3.T), (m3.T, m3),
+ # matrix-matrix non-contiguous
+ (m3, m2), (m2.T, m3.T), (m2.T.copy(), m3.T),
+ # vector-matrix, matrix-vector, contiguous
+ (m1, vr[:3]), (vc[:5], m1), (m1.T, vc[:5]), (vr[:3], m1.T),
+ # vector-matrix, matrix-vector, vector non-contiguous
+ (m1, vr[::2]), (vc[::2], m1), (m1.T, vc[::2]), (vr[::2], m1.T),
+ # vector-matrix, matrix-vector, matrix non-contiguous
+ (m3, vr[:3]), (vc[:5], m3), (m3.T, vc[:5]), (vr[:3], m3.T),
+ # vector-matrix, matrix-vector, both non-contiguous
+ (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T)))
+ def test_dot_equivalent(self, args):
+ r1 = np.matmul(*args)
+ r2 = np.dot(*args)
+ assert_equal(r1, r2)
if sys.version_info[:2] >= (3, 5):
@@ -5895,6 +5962,11 @@ if sys.version_info[:2] >= (3, 5):
assert_equal(self.matmul(a, b), "A")
assert_equal(self.matmul(b, a), "A")
+ def test_matmul_raises(self):
+ assert_raises(TypeError, self.matmul, np.int8(5), np.int8(5))
+ assert_raises(TypeError, self.matmul, np.void(b'abc'), np.void(b'abc'))
+ assert_raises(ValueError, self.matmul, np.arange(10), np.void(b'abc'))
+
def test_matmul_inplace():
# It would be nice to support in-place matmul eventually, but for now
# we don't have a working implementation, so better just to error out
@@ -5909,6 +5981,17 @@ if sys.version_info[:2] >= (3, 5):
exec_ = getattr(builtins, "exec")
assert_raises(TypeError, exec_, "a @= b", globals(), locals())
+ def test_matmul_axes():
+ a = np.arange(3*4*5).reshape(3, 4, 5)
+ c = np.matmul(a, a, axes=[(-2, -1), (-1, -2), (1, 2)])
+ assert c.shape == (3, 4, 4)
+ d = np.matmul(a, a, axes=[(-2, -1), (-1, -2), (0, 1)])
+ assert d.shape == (4, 4, 3)
+ e = np.swapaxes(d, 0, 2)
+ assert_array_equal(e, c)
+ f = np.matmul(a, np.arange(3), axes=[(1, 0), (0), (0)])
+ assert f.shape == (4, 5)
+
class TestInner(object):