summaryrefslogtreecommitdiff
path: root/numpy/core
diff options
context:
space:
mode:
authormattip <matti.picus@gmail.com>2018-11-25 10:46:21 -0600
committermattip <matti.picus@gmail.com>2018-12-01 16:13:59 -0800
commit72f2abf34a43cf3642acf6a6ff951148051404ae (patch)
treed6df709bea86487caf12595f81f6ca76e7e8a373 /numpy/core
parent8c9450a7fd69d5b74b47ffec60b5c235361daeff (diff)
downloadnumpy-72f2abf34a43cf3642acf6a6ff951148051404ae.tar.gz
ENH: corrections and fixes from review
Diffstat (limited to 'numpy/core')
-rw-r--r--numpy/core/code_generators/generate_umath.py14
-rw-r--r--numpy/core/code_generators/ufunc_docstrings.py16
-rw-r--r--numpy/core/src/common/cblasfuncs.c1
-rw-r--r--numpy/core/src/common/cblasfuncs.h2
-rw-r--r--numpy/core/src/multiarray/multiarraymodule.c5
-rw-r--r--numpy/core/src/umath/matmul.c.src327
-rw-r--r--numpy/core/src/umath/ufunc_object.c17
-rw-r--r--numpy/core/src/umath/ufunc_type_resolution.c83
-rw-r--r--numpy/core/src/umath/ufunc_type_resolution.h7
-rw-r--r--numpy/core/tests/test_multiarray.py57
10 files changed, 242 insertions, 287 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index a092cb95e..7522de2ad 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -136,7 +136,7 @@ class Ufunc(object):
self.docstring = docstring
self.typereso = typereso
self.type_descriptions = []
- self.signature = kwargs.pop('signature', 'NULL')
+ self.signature = kwargs.pop('signature', None)
for td in type_descriptions:
self.type_descriptions.extend(td)
for td in self.type_descriptions:
@@ -911,9 +911,9 @@ defdict = {
'matmul' :
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.matmul'),
- "PyUFunc_MatmulTypeResolver",
+ "PyUFunc_SimpleBinaryOperationTypeResolver",
TD(notimes_or_obj),
- signature='"(n?,k),(k,m?)->(n?,m?)"',
+ signature='(n?,k),(k,m?)->(n?,m?)',
),
}
@@ -1060,6 +1060,10 @@ def make_ufuncs(funcdict):
# string literal in C code. We split at endlines because textwrap.wrap
# do not play well with \n
docstring = '\\n\"\"'.join(docstring.split(r"\n"))
+ if uf.signature is None:
+ sig = "NULL"
+ else:
+ sig = '"{}"'.format(uf.signature)
fmt = textwrap.dedent("""\
identity = {identity_expr};
if ({has_identity} && identity == NULL) {{
@@ -1068,7 +1072,7 @@ def make_ufuncs(funcdict):
f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
{name}_functions, {name}_data, {name}_signatures, {nloops},
{nin}, {nout}, {identity}, "{name}",
- "{doc}", 0, {signature}, identity
+ "{doc}", 0, {sig}, identity
);
if ({has_identity}) {{
Py_DECREF(identity);
@@ -1084,7 +1088,7 @@ def make_ufuncs(funcdict):
identity='PyUFunc_IdentityValue',
identity_expr=uf.identity,
doc=docstring,
- signature=uf.signature,
+ sig=sig,
)
# Only PyUFunc_None means don't reorder - we pass this using the old
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index d79702a18..8a690c43d 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -2558,7 +2558,7 @@ add_newdoc('numpy.core.umath', 'matmul',
For other keyword-only arguments, see the
:ref:`ufunc docs <ufuncs.kwargs>`.
- ..versionchanged:: 1.16
+ ..versionadded:: 1.16
Now handles ufunc kwargs
Returns
@@ -2598,11 +2598,11 @@ add_newdoc('numpy.core.umath', 'matmul',
appending a 1 to its dimensions. After matrix multiplication
the appended 1 is removed.
- ``matmul`` differs from ``dot`` in two important ways.
+ ``matmul`` differs from ``dot`` in two important ways:
- Multiplication by scalars is not allowed, use ``*`` instead.
- Stacks of matrices are broadcast together as if the matrices
- were elements, respecting the signature `(n,k),(k,m)->(n,m)`:
+ were elements, respecting the signature ``(n,k),(k,m)->(n,m)``:
>>> a = a = np.full([9,5,7,3], True, dtype=bool)
>>> c = np.full([9, 5, 4,3], True, dtype=bool)
@@ -2639,13 +2639,13 @@ add_newdoc('numpy.core.umath', 'matmul',
Broadcasting is conventional for stacks of arrays
- >>> a = np.arange(2*2*4).reshape((2,2,4))
- >>> b = np.arange(2*2*4).reshape((2,4,2))
+ >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4))
+ >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2))
>>> np.matmul(a,b).shape
(2, 2, 2)
- >>> np.matmul(a,b)[0,1,1]
+ >>> np.matmul(a, b)[0, 1, 1]
98
- >>> sum(a[0,1,:] * b[0,:,1])
+ >>> sum(a[0, 1, :] * b[0 , :, 1])
98
Vector, vector returns the scalar inner product, but neither argument
@@ -2659,7 +2659,7 @@ add_newdoc('numpy.core.umath', 'matmul',
>>> np.matmul([1,2], 3)
Traceback (most recent call last):
...
- ValueError: Scalar operands are not allowed, use '*' instead
+ ValueError: matmul: Input operand 1 does not have enough dimensions ...
.. versionadded:: 1.10.0
""")
diff --git a/numpy/core/src/common/cblasfuncs.c b/numpy/core/src/common/cblasfuncs.c
index 514297940..64a0569ad 100644
--- a/numpy/core/src/common/cblasfuncs.c
+++ b/numpy/core/src/common/cblasfuncs.c
@@ -201,7 +201,6 @@ _bad_strides(PyArrayObject *ap)
return 0;
}
-
/*
* dot(a,b)
* Returns the dot product of a and b for arrays of floating point types.
diff --git a/numpy/core/src/common/cblasfuncs.h b/numpy/core/src/common/cblasfuncs.h
index 78eff25a0..66ce4ca5b 100644
--- a/numpy/core/src/common/cblasfuncs.h
+++ b/numpy/core/src/common/cblasfuncs.h
@@ -3,7 +3,5 @@
NPY_NO_EXPORT PyObject *
cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *);
-NPY_NO_EXPORT int
-_bad_strides(PyArrayObject *ap);
#endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 5ccb7f6d6..ce8af4392 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -75,6 +75,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
#include "umathmodule.h"
NPY_NO_EXPORT int initscalarmath(PyObject *);
+NPY_NO_EXPORT int set_matmul_flags(PyObject *d); /* in ufunc_object.c */
/*
* global variable to determine if legacy printing is enabled, accessible from
@@ -4505,7 +4506,6 @@ intern_strings(void)
npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2;
}
-
#if defined(NPY_PY3K)
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
@@ -4575,6 +4575,9 @@ PyMODINIT_FUNC init_multiarray_umath(void) {
goto err;
}
+ if (set_matmul_flags(d) < 0) {
+ goto err;
+ }
initialize_casting_tables();
initialize_numeric_types();
if(initscalarmath(m) < 0)
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index 5f5bd3506..3ee0ec4f2 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -18,6 +18,7 @@
#include "npy_cblas.h"
#include "arraytypes.h" /* For TYPE_dot functions */
+
#include <assert.h>
/*
@@ -26,6 +27,36 @@
*****************************************************************************
*/
+/*
+ * -1 to be conservative, in case blas internally uses a for loop with an
+ * inclusive upper bound
+ */
+#define BLAS_MAXSIZE (NPY_MAX_INT - 1)
+
+/*
+ * Determine if a 2d matrix can be used by BLAS
+ * 1. Strides must not alias or overlap
+ * 2. The faster (second) axis must be contiguous
+ * 3. The slower (first) axis stride, in unit steps, must be larger than
+ * the faster axis dimension
+ */
+static NPY_INLINE npy_bool
+is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
+ npy_intp d1, npy_intp d2, npy_intp itemsize)
+{
+ npy_intp unit_stride1 = byte_stride1 / itemsize;
+ if (byte_stride2 != itemsize) {
+ return NPY_FALSE;
+ }
+ if ((byte_stride1 % itemsize ==0) &&
+ (unit_stride1 >= d2) &&
+ (unit_stride1 <= BLAS_MAXSIZE))
+ {
+ return NPY_TRUE;
+ }
+ return NPY_FALSE;
+}
+
#if defined(HAVE_CBLAS)
static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
@@ -34,76 +65,89 @@ static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
*
* #name = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
* #ctype = npy_float, npy_double, npy_cfloat, npy_cdouble#
- * #type = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
* #prefix = s, d, c, z#
* #step1 = 1.F, 1., &oneF, &oneD#
* #step0 = 0.F, 0., &zeroF, &zeroD#
*/
NPY_NO_EXPORT void
-@name@_gemv(void *ip1, npy_intp is1_m, void *ip2, npy_intp is2_n, void *op,
- npy_intp m, npy_intp n)
+@name@_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
+ void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
+ void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
+ npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
{
/*
* Vector matrix multiplication -- Level 2 BLAS
* arguments
* ip1: contiguous data, m*n shape
* ip2: data in c order, n*1 shape
- * op: contiguous data in c order, m shape
+ * op: data in c order, m shape
*/
enum CBLAS_ORDER order;
- int lda;
+ int M, N, lda;
+
+ assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
+ assert (is_blasable2d(is2_n, sizeof(@typ@), n, 1, sizeof(@typ@)));
+ M = (int)m;
+ N = (int)n;
- if (is1_m == sizeof(@type@)) {
+ if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
order = CblasColMajor;
- lda = n;
+ lda = (int)(is1_m / sizeof(@typ@));
}
else {
/* If not ColMajor, caller should have ensured we are RowMajor */
/* will not assert in release mode */
- assert(is1_m == sizeof(@type@) * m);
order = CblasRowMajor;
- lda = m;
+ assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
+ lda = (int)(is1_n / sizeof(@typ@));
}
- cblas_@prefix@gemv(order, CblasTrans, n, m, @step1@, ip1, lda, ip2,
- is2_n / sizeof(@type@), @step0@, op, 1);
+ cblas_@prefix@gemv(order, CblasTrans, N, M, @step1@, ip1, lda, ip2,
+ is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@));
}
NPY_NO_EXPORT void
@name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
void *ip2, npy_intp is2_n, npy_intp is2_p,
- void *op, npy_intp m, npy_intp n, npy_intp p)
+ void *op, npy_intp os_m, npy_intp os_p,
+ npy_intp m, npy_intp n, npy_intp p)
{
/*
* matrix matrix multiplication -- Level 3 BLAS
*/
enum CBLAS_ORDER order = CblasRowMajor;
enum CBLAS_TRANSPOSE trans1, trans2;
- int M, N, P, lda, ldb;
- M = m;
- N = n;
- P = p;
+ int M, N, P, lda, ldb, ldc;
+ assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
+ M = (int)m;
+ N = (int)n;
+ P = (int)p;
- if (is1_m == sizeof(@type@)) {
- trans1 = CblasTrans;
- lda = N > 1 ? is1_n / sizeof(@type@) : 1;
+ assert(is_blasable2d(os_m, os_p, m, p, sizeof(@typ@)));
+ ldc = (int)(os_m / sizeof(@typ@));
+
+ if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
+ trans1 = CblasNoTrans;
+ lda = (int)(is1_m / sizeof(@typ@));
}
else {
/* If not ColMajor, caller should have ensured we are RowMajor */
/* will not assert in release mode */
- assert(is1_n == sizeof(@type@));
- trans1 = CblasNoTrans;
- lda = N > 1 ? is1_m / sizeof(@type@) : 1;
+ assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
+ trans1 = CblasTrans;
+ lda = (int)(is1_n / sizeof(@typ@));
}
- if (is2_n == sizeof(@type@)) {
- trans2 = CblasTrans;
- ldb = N > 1 ? is2_p / sizeof(@type@) : 1;
+
+ if (is_blasable2d(is2_n, is2_p, n, p, sizeof(@typ@))) {
+ trans2 = CblasNoTrans;
+ ldb = (int)(is2_n / sizeof(@typ@));
}
else {
/* If not ColMajor, caller should have ensured we are RowMajor */
/* will not assert in release mode */
- assert(is2_p == sizeof(@type@));
- trans2 = CblasNoTrans;
- ldb = P > 1 ? P : 1;
+ assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(@typ@)));
+ trans2 = CblasTrans;
+ ldb = (int)(is2_p / sizeof(@typ@));
}
/*
* Use syrk if we have a case of a matrix times its transpose.
@@ -119,23 +163,23 @@ NPY_NO_EXPORT void
npy_intp i,j;
if (trans1 == CblasNoTrans) {
cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
- ip1, lda, @step0@, op, P);
+ ip1, lda, @step0@, op, ldc);
}
else {
cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
- ip1, ldb, @step0@, op, P);
+ ip1, ldb, @step0@, op, ldc);
}
/* Copy the triangle */
for (i = 0; i < P; i++) {
for (j = i + 1; j < P; j++) {
- ((@type@*)op)[j * P + i] = ((@type@*)op)[i * P + j];
+ ((@typ@*)op)[j * ldc + i] = ((@typ@*)op)[i * ldc + j];
}
}
}
else {
cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda,
- ip2, ldb, @step0@, op, P);
+ ip2, ldb, @step0@, op, ldc);
}
}
@@ -148,94 +192,54 @@ NPY_NO_EXPORT void
*/
/**begin repeat
- * #TYPE = FLOAT, DOUBLE, HALF#
- * #typ = npy_float,npy_double,npy_half#
- * #SPECL = 0,0,1#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
- npy_intp dm, npy_intp dn, npy_intp dp,
- npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
- npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
- npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
- npy_intp os_p)
-{
- npy_intp m, n, p;
- for (m = 0; m < dm; m++) {
- for (p = 0; p < dp; p++) {
- /*
- * Use a double as an intermediate sum, which is natural for
- * npy_double, slightly increases the accuracy of npy_float,
- * and is perhaps overkill for npy_half.
- */
- double sum = 0;
- for (n = 0; n < dn; n++) {
-#if @SPECL@ == 1
- @typ@ val1 = (*(@typ@ *)ip1);
- @typ@ val2 = (*(@typ@ *)ip2);
- sum += npy_half_to_float(val1) * npy_half_to_float(val2);
-#else
- sum += ((double)*(@typ@ *)ip1) * ((double)*(@typ@ *)ip2);
-#endif
- ip2 += is2_n;
- ip1 += is1_n;
- }
-#if @SPECL@ == 1
- *(@typ@ *)op = npy_float_to_half((float)sum);
-#else
- /* in the case of double -> float, may produce INF */
- *(@typ@ *)op = (@typ@)sum;
-#endif
- ip1 -= ib1_n;
- ip2 -= ib2_n;
- op += os_p;
- ip2 += is2_p;
- }
- op -= ob_p;
- ip2 -= ib2_p;
- ip1 += is1_m;
- op += os_m;
- }
-}
-
-/**end repeat**/
-
-/**begin repeat
* #TYPE = LONGDOUBLE,
+ * FLOAT, DOUBLE, HALF,
* CFLOAT, CDOUBLE, CLONGDOUBLE,
* UBYTE, USHORT, UINT, ULONG, ULONGLONG,
* BYTE, SHORT, INT, LONG, LONGLONG,
* BOOL#
* #typ = npy_longdouble,
+ * npy_float,npy_double,npy_half,
* npy_cfloat, npy_cdouble, npy_clongdouble,
* npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
* npy_byte, npy_short, npy_int, npy_long, npy_longlong,
* npy_bool#
- * #IS_COMPLEX = 0, 1, 1, 1, 0*11#
+ * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*11#
+ * #IS_HALF = 0, 0, 0, 1, 0*14#
*/
NPY_NO_EXPORT void
-@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
- npy_intp dm, npy_intp dn, npy_intp dp,
- npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
- npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
- npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
- npy_intp os_p)
+@TYPE@_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+ void *_ip2, npy_intp is2_n, npy_intp is2_p,
+ void *_op, npy_intp os_m, npy_intp os_p,
+ npy_intp dm, npy_intp dn, npy_intp dp)
+
{
npy_intp m, n, p;
+ npy_intp ib1_n, ib2_n, ib2_p, ob_p;
+ char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;
+
+ ib1_n = is1_n * dn;
+ ib2_n = is2_n * dn;
+ ib2_p = is2_p * dp;
+ ob_p = os_p * dp;
+
for (m = 0; m < dm; m++) {
for (p = 0; p < dp; p++) {
#if @IS_COMPLEX@ == 1
(*(@typ@ *)op).real = 0;
(*(@typ@ *)op).imag = 0;
+#elif @IS_HALF@
+ float sum = 0;
#else
*(@typ@ *)op = 0;
#endif
for (n = 0; n < dn; n++) {
@typ@ val1 = (*(@typ@ *)ip1);
@typ@ val2 = (*(@typ@ *)ip2);
-#if @IS_COMPLEX@ == 1
+#if @IS_HALF@
+ sum += npy_half_to_float(val1) * npy_half_to_float(val2);
+#elif @IS_COMPLEX@ == 1
(*(@typ@ *)op).real += (val1.real * val2.real) -
(val1.imag * val2.imag);
(*(@typ@ *)op).imag += (val1.real * val2.imag) +
@@ -246,6 +250,9 @@ NPY_NO_EXPORT void
ip2 += is2_n;
ip1 += is1_n;
}
+#if @IS_HALF@
+ *(@typ@ *)op = npy_float_to_half(sum);
+#endif
ip1 -= ib1_n;
ip2 -= ib2_n;
op += os_p;
@@ -271,10 +278,8 @@ NPY_NO_EXPORT void
* npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
* npy_byte, npy_short, npy_int, npy_long, npy_longlong,
* npy_bool#
- * #SPECL = 0, 0, 0, 2, 1, 1, 1, 0*11#
+ * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*11#
* #USEBLAS = 1, 1, 0, 0, 1, 1, 0*12#
- * #chr = s, d, 0, 0, c, z, 0*12#
- * #blas_typ = npy_float, npy_double, void*16#
*/
@@ -291,62 +296,39 @@ NPY_NO_EXPORT void
npy_intp dp = dimensions[2];
npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
os_m=steps[4], os_p=steps[5];
- npy_intp ib1_n, ib2_n, ib2_p, ob_p;
-#if @USEBLAS@ & defined(HAVE_CBLAS)
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+ npy_intp sz = sizeof(@typ@);
npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
- npy_bool scalar_out = (dm ==1 && dp == 1);
+ npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+ npy_bool scalar_out = (dm == 1 && dp == 1);
npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
- npy_bool too_big_for_blas = (dm > NPY_MAX_INT || dn > NPY_MAX_INT ||
- dp >= NPY_MAX_INT);
- npy_bool input_contiguous = ((is1_m == sizeof(@typ@) ||
- is1_n == sizeof(@typ@)) &&
- (is2_n == sizeof(@typ@) ||
- is2_p == sizeof(@typ@)));
- npy_bool vector_matrix = ((dm == 1) &&
- (is2_n == sizeof(@typ@) || (is2_n == sizeof(@typ@) * dp)));
- npy_bool matrix_vector = ((dp == 1) &&
- (is1_n == sizeof(@typ@) || (is1_n == sizeof(@typ@) * dm)));
+ npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+ dp > BLAS_MAXSIZE);
+ npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+ npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+ npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+ npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+ npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+ npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+ npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+ npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+ npy_bool vector_matrix = ((dm == 1) && i2blasable);
+ npy_bool matrix_vector = ((dp == 1) && i1blasable);
#endif
- ib1_n = is1_n*dn;
- ib2_n = is2_n*dn;
- ib2_p = is2_p*dp;
- ob_p = os_p *dp;
-
- if (dn == 0) {
- /* No operand, need to zero the output */
- for (iOuter = 0; iOuter < dOuter; iOuter++,
- args[0] += s0, args[1] += s1, args[2] += s2) {
- npy_intp m, p;
- char *op=args[2];
- for (m = 0; m < dm; m++) {
- for (p = 0; p < dp; p++) {
-#if @SPECL@ == 1
- (*(@typ@ *)op).real = 0;
- (*(@typ@ *)op).imag = 0;
-#else
- *(@typ@ *)op = 0;
-#endif
- op += os_p;
- }
- op += os_m - ob_p;
- }
- }
- return;
- }
for (iOuter = 0; iOuter < dOuter; iOuter++,
args[0] += s0, args[1] += s1, args[2] += s2) {
- char *ip1=args[0], *ip2=args[1], *op=args[2];
-#if @USEBLAS@ & defined(HAVE_CBLAS)
+ void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if @USEBLAS@ && defined(HAVE_CBLAS)
/*
* TODO: refactor this out to a inner_loop_selector, in
* PyUFunc_MatmulLoopSelector. But that call does not have access to
* n, m, p and strides.
*/
- if (too_big_for_blas) {
- @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
- ib1_n, ib2_n, ib2_p, ob_p,
- is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ if (too_big_for_blas || any_zero_dim) {
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
}
else if (special_case) {
/* Special case variants that have a 1 in the core dimensions */
@@ -355,44 +337,59 @@ NPY_NO_EXPORT void
@TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
} else if (scalar_vec){
/*
- * 0d @ vector or vector @ 0d
+ * 1,1d @ vector or vector @ 1,1d
* could use cblas_Xaxy, but that requires 0ing output
* and would not be faster (XXX prove it)
*/
- @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
- ib1_n, ib2_n, ib2_p, ob_p,
- is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
} else if (vector_matrix) {
/* vector @ matrix, switch ip1, ip2, p and m */
- @TYPE@_gemv((void*)ip2, is2_n, (void*)ip1, is1_n,
- (void*)op, dp, dn);
+ @TYPE@_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+ op, os_p, os_m, dp, dn, dm);
} else if (matrix_vector) {
/* matrix @ vector */
- @TYPE@_gemv((void*)ip1, is1_n, (void*)ip2, is2_n,
- (void*)op, dm, dn);
+ @TYPE@_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+ op, os_m, os_p, dm, dn, dp);
} else {
- /* column @ row, 2d output, no blas needed or non-contiguous input */
- @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
- ib1_n, ib2_n, ib2_p, ob_p,
- is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ /* column @ row, 2d output, no blas needed or non-blas-able input */
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
}
} else {
/* matrix @ matrix */
- if (input_contiguous) {
- /* can only use blas if input is contiguous */
- @TYPE@_matmul_matrixmatrix((void*)ip1, is1_m, is1_n,
- (void*)ip2, is2_n, is2_p,
- (void*)op, dm, dn, dp);
+ if (i1blasable && i2blasable && o_c_blasable) {
+ @TYPE@_matmul_matrixmatrix(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p,
+ dm, dn, dp);
+ } else if (i1blasable && i2blasable && o_f_blasable) {
+ /*
+ * Use transpose equivalence:
+ * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+ */
+ @TYPE@_matmul_matrixmatrix(ip2, is2_p, is2_n,
+ ip1, is1_n, is1_m,
+ op, os_p, os_m,
+ dp, dn, dm);
} else {
- @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
- ib1_n, ib2_n, ib2_p, ob_p,
- is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ /*
+ * If parameters are castable to int and we copy the
+ * non-blasable (or non-ccontiguous output)
+ * we could still use BLAS, see gh-12365.
+ */
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
}
}
#else
- @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
- ib1_n, ib2_n, ib2_p, ob_p,
- is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+ @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n,
+ ip2, is2_n, is2_p,
+ op, os_m, os_p, dm, dn, dp);
#endif
}
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 66f512f7b..5dc1db226 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -319,6 +319,23 @@ _find_array_prepare(ufunc_full_args args,
NPY_ITER_NO_BROADCAST | \
NPY_ITER_NO_SUBTYPE | \
NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE
+
+/* Called at module initialization to set the matmul ufunc output flags */
+NPY_NO_EXPORT int
+set_matmul_flags(PyObject *d)
+{
+ PyObject *matmul = PyDict_GetItemString(d, "matmul");
+ if (matmul == NULL) {
+ return -1;
+ }
+ ((PyUFuncObject *)matmul)->op_flags[2] = (NPY_ITER_WRITEONLY |
+ NPY_ITER_UPDATEIFCOPY |
+ NPY_UFUNC_DEFAULT_OUTPUT_FLAGS) &
+ ~NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
+ return 0;
+}
+
+
/*
* Set per-operand flags according to desired input or output flags.
* op_flags[i] for i in input (as determined by ufunc->nin) will be
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index efd923972..ec60d9cfd 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -1304,89 +1304,6 @@ PyUFunc_MixedDivisionTypeResolver(PyUFuncObject *ufunc,
type_tup, out_dtypes);
}
-/*
- * XXX This is too restrictive, we should only check the inner axis involved
- * gh-12365
- */
-#define NOT_CONTIGUOUS(a) (PyArray_NDIM(a) >=2 && !( \
- PyArray_IS_C_CONTIGUOUS(a) || PyArray_IS_F_CONTIGUOUS(a)))
-
-/*
- * This function applies special type resolution rules for the case
- * where all the functions have the pattern XX->X, using
- * PyArray_ResultType instead of a linear search to get the best
- * loop, like PyUFunc_SimpleBinaryOperationTypeResolver, and adds
- * memory overlap and contiguity considerations to the operands,
- * possibly creating Writeback temporary data
- *
- * Returns 0 on success, -1 on error.
- */
-NPY_NO_EXPORT int
-PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
- NPY_CASTING casting,
- PyArrayObject **operands,
- PyObject *type_tup,
- PyArray_Descr **out_dtypes)
-{
-
- if (PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting, operands,
- type_tup, out_dtypes) < 0) {
- return -1;
- }
- if (PyArray_NDIM(operands[0]) >= 2 || PyArray_NDIM(operands[1]) >= 2) {
-#if defined(HAVE_CBLAS)
- int typenum = out_dtypes[2]->type_num;
- if ( (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
- NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
- /*
- * We are going to use BLAS
- * make sure 2d and more arrays are contiguous
- */
- if (_bad_strides(operands[0]) || NOT_CONTIGUOUS(operands[0])) {
- PyObject *op = PyArray_NewCopy(operands[0], NPY_ANYORDER);
-
- if (op == NULL) {
- return -1;
- }
- Py_DECREF(operands[0]);
- operands[0] = (PyArrayObject *)op;
- }
- if (_bad_strides(operands[1]) || NOT_CONTIGUOUS(operands[1])) {
- PyObject *op = PyArray_NewCopy(operands[1], NPY_ANYORDER);
-
- if (op == NULL) {
- return -1;
- }
- Py_DECREF(operands[1]);
- operands[1] = (PyArrayObject *)op;
- }
- }
-#endif
- if (operands[2] != NULL) {
- npy_intp last_stride = PyArray_STRIDE(operands[2],
- PyArray_NDIM(operands[2]) - 1);
- if (last_stride != PyArray_ITEMSIZE(operands[2])) {
- PyErr_SetString(PyExc_ValueError,
- "output array is not acceptable (must be C-contiguous)");
- return -1;
- }
- /*
- * Use all the flags in PyUFunc_GeneralizedFunction
- * except NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE and
- * NPY_ITER_WRITEONLY. Without NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE
- * nditer will properly check overlap between output and any input.
- */
- ufunc->op_flags[2] = NPY_ITER_WRITEONLY |
- NPY_ITER_UPDATEIFCOPY |
- NPY_ITER_ALIGNED |
- NPY_ITER_ALLOCATE |
- NPY_ITER_NO_BROADCAST;
- }
- }
- return 0;
-}
-
-
static int
find_userloop(PyUFuncObject *ufunc,
PyArray_Descr **dtypes,
diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h
index 5306a5983..2f37af753 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.h
+++ b/numpy/core/src/umath/ufunc_type_resolution.h
@@ -145,11 +145,4 @@ PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc,
NpyAuxData **out_innerloopdata,
int *out_needs_api);
-NPY_NO_EXPORT int
-PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
- NPY_CASTING casting,
- PyArrayObject **operands,
- PyObject *type_tup,
- PyArray_Descr **out_dtypes);
-
#endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 80c69d9b3..fbf8ac0f5 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -5716,7 +5716,7 @@ class MatmulCommon(object):
res = self.matmul(v, v)
assert_(type(res) is np.dtype(dt).type)
- def test_0d_vector_values(self):
+ def test_scalar_output(self):
vec1 = np.array([2])
vec2 = np.array([3, 4]).reshape(1, -1)
tgt = np.array([6, 8])
@@ -5876,33 +5876,29 @@ class TestMatmul(MatmulCommon):
matmul = np.matmul
def test_out_arg(self):
- a = np.ones((2, 2), dtype=float)
- b = np.ones((2, 2), dtype=float)
- tgt = np.full((2,2), 2, dtype=float)
+ a = np.ones((5, 2), dtype=float)
+ b = np.array([[1, 3], [5, 7]], dtype=float)
+ tgt = np.dot(a, b)
# test as positional argument
msg = "out positional argument"
- out = np.zeros((2, 2), dtype=float)
+ out = np.zeros((5, 2), dtype=float)
self.matmul(a, b, out)
assert_array_equal(out, tgt, err_msg=msg)
# test as keyword argument
msg = "out keyword argument"
- out = np.zeros((2, 2), dtype=float)
+ out = np.zeros((5, 2), dtype=float)
self.matmul(a, b, out=out)
assert_array_equal(out, tgt, err_msg=msg)
# test out with not allowed type cast (safe casting)
msg = "Cannot cast ufunc matmul output"
- out = np.zeros((2, 2), dtype=np.int32)
+ out = np.zeros((5, 2), dtype=np.int32)
assert_raises_regex(TypeError, msg, self.matmul, a, b, out=out)
- # cblas does not allow non-contiguous
- # outputs and consistency with dot would require same type,
- # dimensions, subtype, and c_contiguous.
-
# test out with type upcast to complex
- out = np.zeros((2, 2), dtype=np.complex128)
+ out = np.zeros((5, 2), dtype=np.complex128)
c = self.matmul(a, b, out=out)
assert_(c is out)
with suppress_warnings() as sup:
@@ -5910,15 +5906,39 @@ class TestMatmul(MatmulCommon):
c = c.astype(tgt.dtype)
assert_array_equal(c, tgt)
+ def test_out_contiguous(self):
+ a = np.ones((5, 2), dtype=float)
+ b = np.array([[1, 3], [5, 7]], dtype=float)
+ v = np.array([1, 3], dtype=float)
+ tgt = np.dot(a, b)
+ tgt_mv = np.dot(a, v)
+
# test out non-contiguous
- out = np.ones((2, 2, 2), dtype=float)
- assert_raises(ValueError, self.matmul, a, b, out=out[..., 0])
+ out = np.ones((5, 2, 2), dtype=float)
+ c = self.matmul(a, b, out=out[..., 0])
+ assert c.base is out
+ assert_array_equal(c, tgt)
+ c = self.matmul(a, v, out=out[:, 0, 0])
+ assert_array_equal(c, tgt_mv)
+ c = self.matmul(v, a.T, out=out[:, 0, 0])
+ assert_array_equal(c, tgt_mv)
+
+ # test out contiguous in only last dim
+ out = np.ones((10, 2), dtype=float)
+ c = self.matmul(a, b, out=out[::2, :])
+ assert_array_equal(c, tgt)
+
+ # test transposes of out, args
+ out = np.ones((5, 2), dtype=float)
+ c = self.matmul(b.T, a.T, out=out.T)
+ assert_array_equal(out, tgt)
m1 = np.arange(15.).reshape(5, 3)
m2 = np.arange(21.).reshape(3, 7)
m3 = np.arange(30.).reshape(5, 6)[:, ::2] # non-contiguous
vc = np.arange(10.)
vr = np.arange(6.)
+ m0 = np.zeros((3, 0))
@pytest.mark.parametrize('args', (
# matrix-matrix
(m1, m2), (m2.T, m1.T), (m2.T.copy(), m1.T), (m2.T, m1.T.copy()),
@@ -5934,12 +5954,19 @@ class TestMatmul(MatmulCommon):
# vector-matrix, matrix-vector, matrix non-contiguous
(m3, vr[:3]), (vc[:5], m3), (m3.T, vc[:5]), (vr[:3], m3.T),
# vector-matrix, matrix-vector, both non-contiguous
- (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T)))
+ (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T),
+ # size == 0
+ (m0, m0.T), (m0.T, m0), (m1, m0), (m0.T, m1.T),
+ ))
def test_dot_equivalent(self, args):
r1 = np.matmul(*args)
r2 = np.dot(*args)
assert_equal(r1, r2)
+ r3 = np.matmul(args[0].copy(), args[1].copy())
+ assert_equal(r1, r3)
+
+
if sys.version_info[:2] >= (3, 5):
class TestMatmulOperator(MatmulCommon):