diff options
author | mattip <matti.picus@gmail.com> | 2018-11-25 10:46:21 -0600 |
---|---|---|
committer | mattip <matti.picus@gmail.com> | 2018-12-01 16:13:59 -0800 |
commit | 72f2abf34a43cf3642acf6a6ff951148051404ae (patch) | |
tree | d6df709bea86487caf12595f81f6ca76e7e8a373 /numpy/core | |
parent | 8c9450a7fd69d5b74b47ffec60b5c235361daeff (diff) | |
download | numpy-72f2abf34a43cf3642acf6a6ff951148051404ae.tar.gz |
ENH: corrections and fixes from review
Diffstat (limited to 'numpy/core')
-rw-r--r-- | numpy/core/code_generators/generate_umath.py | 14 | ||||
-rw-r--r-- | numpy/core/code_generators/ufunc_docstrings.py | 16 | ||||
-rw-r--r-- | numpy/core/src/common/cblasfuncs.c | 1 | ||||
-rw-r--r-- | numpy/core/src/common/cblasfuncs.h | 2 | ||||
-rw-r--r-- | numpy/core/src/multiarray/multiarraymodule.c | 5 | ||||
-rw-r--r-- | numpy/core/src/umath/matmul.c.src | 327 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_object.c | 17 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_type_resolution.c | 83 | ||||
-rw-r--r-- | numpy/core/src/umath/ufunc_type_resolution.h | 7 | ||||
-rw-r--r-- | numpy/core/tests/test_multiarray.py | 57 |
10 files changed, 242 insertions, 287 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index a092cb95e..7522de2ad 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -136,7 +136,7 @@ class Ufunc(object): self.docstring = docstring self.typereso = typereso self.type_descriptions = [] - self.signature = kwargs.pop('signature', 'NULL') + self.signature = kwargs.pop('signature', None) for td in type_descriptions: self.type_descriptions.extend(td) for td in self.type_descriptions: @@ -911,9 +911,9 @@ defdict = { 'matmul' : Ufunc(2, 1, None, docstrings.get('numpy.core.umath.matmul'), - "PyUFunc_MatmulTypeResolver", + "PyUFunc_SimpleBinaryOperationTypeResolver", TD(notimes_or_obj), - signature='"(n?,k),(k,m?)->(n?,m?)"', + signature='(n?,k),(k,m?)->(n?,m?)', ), } @@ -1060,6 +1060,10 @@ def make_ufuncs(funcdict): # string literal in C code. We split at endlines because textwrap.wrap # do not play well with \n docstring = '\\n\"\"'.join(docstring.split(r"\n")) + if uf.signature is None: + sig = "NULL" + else: + sig = '"{}"'.format(uf.signature) fmt = textwrap.dedent("""\ identity = {identity_expr}; if ({has_identity} && identity == NULL) {{ @@ -1068,7 +1072,7 @@ def make_ufuncs(funcdict): f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity( {name}_functions, {name}_data, {name}_signatures, {nloops}, {nin}, {nout}, {identity}, "{name}", - "{doc}", 0, {signature}, identity + "{doc}", 0, {sig}, identity ); if ({has_identity}) {{ Py_DECREF(identity); @@ -1084,7 +1088,7 @@ def make_ufuncs(funcdict): identity='PyUFunc_IdentityValue', identity_expr=uf.identity, doc=docstring, - signature=uf.signature, + sig=sig, ) # Only PyUFunc_None means don't reorder - we pass this using the old diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py index d79702a18..8a690c43d 100644 --- a/numpy/core/code_generators/ufunc_docstrings.py +++ b/numpy/core/code_generators/ufunc_docstrings.py @@ -2558,7 +2558,7 @@ add_newdoc('numpy.core.umath', 'matmul', For other keyword-only arguments, see the :ref:`ufunc docs <ufuncs.kwargs>`. - ..versionchanged:: 1.16 + ..versionadded:: 1.16 Now handles ufunc kwargs Returns @@ -2598,11 +2598,11 @@ add_newdoc('numpy.core.umath', 'matmul', appending a 1 to its dimensions. After matrix multiplication the appended 1 is removed. - ``matmul`` differs from ``dot`` in two important ways. + ``matmul`` differs from ``dot`` in two important ways: - Multiplication by scalars is not allowed, use ``*`` instead. - Stacks of matrices are broadcast together as if the matrices - were elements, respecting the signature `(n,k),(k,m)->(n,m)`: + were elements, respecting the signature ``(n,k),(k,m)->(n,m)``: >>> a = a = np.full([9,5,7,3], True, dtype=bool) >>> c = np.full([9, 5, 4,3], True, dtype=bool) @@ -2639,13 +2639,13 @@ add_newdoc('numpy.core.umath', 'matmul', Broadcasting is conventional for stacks of arrays - >>> a = np.arange(2*2*4).reshape((2,2,4)) - >>> b = np.arange(2*2*4).reshape((2,4,2)) + >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4)) + >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2)) >>> np.matmul(a,b).shape (2, 2, 2) - >>> np.matmul(a,b)[0,1,1] + >>> np.matmul(a, b)[0, 1, 1] 98 - >>> sum(a[0,1,:] * b[0,:,1]) + >>> sum(a[0, 1, :] * b[0 , :, 1]) 98 Vector, vector returns the scalar inner product, but neither argument @@ -2659,7 +2659,7 @@ add_newdoc('numpy.core.umath', 'matmul', >>> np.matmul([1,2], 3) Traceback (most recent call last): ... - ValueError: Scalar operands are not allowed, use '*' instead + ValueError: matmul: Input operand 1 does not have enough dimensions ... .. versionadded:: 1.10.0 """) diff --git a/numpy/core/src/common/cblasfuncs.c b/numpy/core/src/common/cblasfuncs.c index 514297940..64a0569ad 100644 --- a/numpy/core/src/common/cblasfuncs.c +++ b/numpy/core/src/common/cblasfuncs.c @@ -201,7 +201,6 @@ _bad_strides(PyArrayObject *ap) return 0; } - /* * dot(a,b) * Returns the dot product of a and b for arrays of floating point types. diff --git a/numpy/core/src/common/cblasfuncs.h b/numpy/core/src/common/cblasfuncs.h index 78eff25a0..66ce4ca5b 100644 --- a/numpy/core/src/common/cblasfuncs.h +++ b/numpy/core/src/common/cblasfuncs.h @@ -3,7 +3,5 @@ NPY_NO_EXPORT PyObject * cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int -_bad_strides(PyArrayObject *ap); #endif diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 5ccb7f6d6..ce8af4392 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -75,6 +75,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "umathmodule.h" NPY_NO_EXPORT int initscalarmath(PyObject *); +NPY_NO_EXPORT int set_matmul_flags(PyObject *d); /* in ufunc_object.c */ /* * global variable to determine if legacy printing is enabled, accessible from @@ -4505,7 +4506,6 @@ intern_strings(void) npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2; } - #if defined(NPY_PY3K) static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, @@ -4575,6 +4575,9 @@ PyMODINIT_FUNC init_multiarray_umath(void) { goto err; } + if (set_matmul_flags(d) < 0) { + goto err; + } initialize_casting_tables(); initialize_numeric_types(); if(initscalarmath(m) < 0) diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src index 5f5bd3506..3ee0ec4f2 100644 --- a/numpy/core/src/umath/matmul.c.src +++ b/numpy/core/src/umath/matmul.c.src @@ -18,6 +18,7 @@ #include "npy_cblas.h" #include "arraytypes.h" /* For TYPE_dot functions */ + #include <assert.h> /* @@ -26,6 +27,36 @@ ***************************************************************************** */ +/* + * -1 to be conservative, in case blas internally uses a for loop with an + * inclusive upper bound + */ +#define BLAS_MAXSIZE (NPY_MAX_INT - 1) + +/* + * Determine if a 2d matrix can be used by BLAS + * 1. Strides must not alias or overlap + * 2. The faster (second) axis must be contiguous + * 3. The slower (first) axis stride, in unit steps, must be larger than + * the faster axis dimension + */ +static NPY_INLINE npy_bool +is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2, + npy_intp d1, npy_intp d2, npy_intp itemsize) +{ + npy_intp unit_stride1 = byte_stride1 / itemsize; + if (byte_stride2 != itemsize) { + return NPY_FALSE; + } + if ((byte_stride1 % itemsize ==0) && + (unit_stride1 >= d2) && + (unit_stride1 <= BLAS_MAXSIZE)) + { + return NPY_TRUE; + } + return NPY_FALSE; +} + #if defined(HAVE_CBLAS) static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0}; static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0}; @@ -34,76 +65,89 @@ static const npy_cfloat oneF = {1.0, 0.0}, zeroF = {0.0, 0.0}; * * #name = FLOAT, DOUBLE, CFLOAT, CDOUBLE# * #ctype = npy_float, npy_double, npy_cfloat, npy_cdouble# - * #type = npy_float, npy_double, npy_cfloat, npy_cdouble# + * #typ = npy_float, npy_double, npy_cfloat, npy_cdouble# * #prefix = s, d, c, z# * #step1 = 1.F, 1., &oneF, &oneD# * #step0 = 0.F, 0., &zeroF, &zeroD# */ NPY_NO_EXPORT void -@name@_gemv(void *ip1, npy_intp is1_m, void *ip2, npy_intp is2_n, void *op, - npy_intp m, npy_intp n) +@name@_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n, + void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p), + void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p), + npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p)) { /* * Vector matrix multiplication -- Level 2 BLAS * arguments * ip1: contiguous data, m*n shape * ip2: data in c order, n*1 shape - * op: contiguous data in c order, m shape + * op: data in c order, m shape */ enum CBLAS_ORDER order; - int lda; + int M, N, lda; + + assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE); + assert (is_blasable2d(is2_n, sizeof(@typ@), n, 1, sizeof(@typ@))); + M = (int)m; + N = (int)n; - if (is1_m == sizeof(@type@)) { + if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) { order = CblasColMajor; - lda = n; + lda = (int)(is1_m / sizeof(@typ@)); } else { /* If not ColMajor, caller should have ensured we are RowMajor */ /* will not assert in release mode */ - assert(is1_m == sizeof(@type@) * m); order = CblasRowMajor; - lda = m; + assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@))); + lda = (int)(is1_n / sizeof(@typ@)); } - cblas_@prefix@gemv(order, CblasTrans, n, m, @step1@, ip1, lda, ip2, - is2_n / sizeof(@type@), @step0@, op, 1); + cblas_@prefix@gemv(order, CblasTrans, N, M, @step1@, ip1, lda, ip2, + is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@)); } NPY_NO_EXPORT void @name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n, void *ip2, npy_intp is2_n, npy_intp is2_p, - void *op, npy_intp m, npy_intp n, npy_intp p) + void *op, npy_intp os_m, npy_intp os_p, + npy_intp m, npy_intp n, npy_intp p) { /* * matrix matrix multiplication -- Level 3 BLAS */ enum CBLAS_ORDER order = CblasRowMajor; enum CBLAS_TRANSPOSE trans1, trans2; - int M, N, P, lda, ldb; - M = m; - N = n; - P = p; + int M, N, P, lda, ldb, ldc; + assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE); + M = (int)m; + N = (int)n; + P = (int)p; - if (is1_m == sizeof(@type@)) { - trans1 = CblasTrans; - lda = N > 1 ? is1_n / sizeof(@type@) : 1; + assert(is_blasable2d(os_m, os_p, m, p, sizeof(@typ@))); + ldc = (int)(os_m / sizeof(@typ@)); + + if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) { + trans1 = CblasNoTrans; + lda = (int)(is1_m / sizeof(@typ@)); } else { /* If not ColMajor, caller should have ensured we are RowMajor */ /* will not assert in release mode */ - assert(is1_n == sizeof(@type@)); - trans1 = CblasNoTrans; - lda = N > 1 ? is1_m / sizeof(@type@) : 1; + assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@))); + trans1 = CblasTrans; + lda = (int)(is1_n / sizeof(@typ@)); } - if (is2_n == sizeof(@type@)) { - trans2 = CblasTrans; - ldb = N > 1 ? is2_p / sizeof(@type@) : 1; + + if (is_blasable2d(is2_n, is2_p, n, p, sizeof(@typ@))) { + trans2 = CblasNoTrans; + ldb = (int)(is2_n / sizeof(@typ@)); } else { /* If not ColMajor, caller should have ensured we are RowMajor */ /* will not assert in release mode */ - assert(is2_p == sizeof(@type@)); - trans2 = CblasNoTrans; - ldb = P > 1 ? P : 1; + assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(@typ@))); + trans2 = CblasTrans; + ldb = (int)(is2_p / sizeof(@typ@)); } /* * Use syrk if we have a case of a matrix times its transpose. @@ -119,23 +163,23 @@ NPY_NO_EXPORT void npy_intp i,j; if (trans1 == CblasNoTrans) { cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@, - ip1, lda, @step0@, op, P); + ip1, lda, @step0@, op, ldc); } else { cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@, - ip1, ldb, @step0@, op, P); + ip1, ldb, @step0@, op, ldc); } /* Copy the triangle */ for (i = 0; i < P; i++) { for (j = i + 1; j < P; j++) { - ((@type@*)op)[j * P + i] = ((@type@*)op)[i * P + j]; + ((@typ@*)op)[j * ldc + i] = ((@typ@*)op)[i * ldc + j]; } } } else { cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda, - ip2, ldb, @step0@, op, P); + ip2, ldb, @step0@, op, ldc); } } @@ -148,94 +192,54 @@ NPY_NO_EXPORT void */ /**begin repeat - * #TYPE = FLOAT, DOUBLE, HALF# - * #typ = npy_float,npy_double,npy_half# - * #SPECL = 0,0,1# - */ - -NPY_NO_EXPORT void -@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op, - npy_intp dm, npy_intp dn, npy_intp dp, - npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p, - npy_intp ob_p, npy_intp is1_m, npy_intp is1_n, - npy_intp is2_n, npy_intp is2_p, npy_intp os_m, - npy_intp os_p) -{ - npy_intp m, n, p; - for (m = 0; m < dm; m++) { - for (p = 0; p < dp; p++) { - /* - * Use a double as an intermediate sum, which is natural for - * npy_double, slightly increases the accuracy of npy_float, - * and is perhaps overkill for npy_half. - */ - double sum = 0; - for (n = 0; n < dn; n++) { -#if @SPECL@ == 1 - @typ@ val1 = (*(@typ@ *)ip1); - @typ@ val2 = (*(@typ@ *)ip2); - sum += npy_half_to_float(val1) * npy_half_to_float(val2); -#else - sum += ((double)*(@typ@ *)ip1) * ((double)*(@typ@ *)ip2); -#endif - ip2 += is2_n; - ip1 += is1_n; - } -#if @SPECL@ == 1 - *(@typ@ *)op = npy_float_to_half((float)sum); -#else - /* in the case of double -> float, may produce INF */ - *(@typ@ *)op = (@typ@)sum; -#endif - ip1 -= ib1_n; - ip2 -= ib2_n; - op += os_p; - ip2 += is2_p; - } - op -= ob_p; - ip2 -= ib2_p; - ip1 += is1_m; - op += os_m; - } -} - -/**end repeat**/ - -/**begin repeat * #TYPE = LONGDOUBLE, + * FLOAT, DOUBLE, HALF, * CFLOAT, CDOUBLE, CLONGDOUBLE, * UBYTE, USHORT, UINT, ULONG, ULONGLONG, * BYTE, SHORT, INT, LONG, LONGLONG, * BOOL# * #typ = npy_longdouble, + * npy_float,npy_double,npy_half, * npy_cfloat, npy_cdouble, npy_clongdouble, * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, * npy_byte, npy_short, npy_int, npy_long, npy_longlong, * npy_bool# - * #IS_COMPLEX = 0, 1, 1, 1, 0*11# + * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*11# + * #IS_HALF = 0, 0, 0, 1, 0*14# */ NPY_NO_EXPORT void -@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op, - npy_intp dm, npy_intp dn, npy_intp dp, - npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p, - npy_intp ob_p, npy_intp is1_m, npy_intp is1_n, - npy_intp is2_n, npy_intp is2_p, npy_intp os_m, - npy_intp os_p) +@TYPE@_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n, + void *_ip2, npy_intp is2_n, npy_intp is2_p, + void *_op, npy_intp os_m, npy_intp os_p, + npy_intp dm, npy_intp dn, npy_intp dp) + { npy_intp m, n, p; + npy_intp ib1_n, ib2_n, ib2_p, ob_p; + char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op; + + ib1_n = is1_n * dn; + ib2_n = is2_n * dn; + ib2_p = is2_p * dp; + ob_p = os_p * dp; + for (m = 0; m < dm; m++) { for (p = 0; p < dp; p++) { #if @IS_COMPLEX@ == 1 (*(@typ@ *)op).real = 0; (*(@typ@ *)op).imag = 0; +#elif @IS_HALF@ + float sum = 0; #else *(@typ@ *)op = 0; #endif for (n = 0; n < dn; n++) { @typ@ val1 = (*(@typ@ *)ip1); @typ@ val2 = (*(@typ@ *)ip2); -#if @IS_COMPLEX@ == 1 +#if @IS_HALF@ + sum += npy_half_to_float(val1) * npy_half_to_float(val2); +#elif @IS_COMPLEX@ == 1 (*(@typ@ *)op).real += (val1.real * val2.real) - (val1.imag * val2.imag); (*(@typ@ *)op).imag += (val1.real * val2.imag) + @@ -246,6 +250,9 @@ NPY_NO_EXPORT void ip2 += is2_n; ip1 += is1_n; } +#if @IS_HALF@ + *(@typ@ *)op = npy_float_to_half(sum); +#endif ip1 -= ib1_n; ip2 -= ib2_n; op += os_p; @@ -271,10 +278,8 @@ NPY_NO_EXPORT void * npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong, * npy_byte, npy_short, npy_int, npy_long, npy_longlong, * npy_bool# - * #SPECL = 0, 0, 0, 2, 1, 1, 1, 0*11# + * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*11# * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*12# - * #chr = s, d, 0, 0, c, z, 0*12# - * #blas_typ = npy_float, npy_double, void*16# */ @@ -291,62 +296,39 @@ NPY_NO_EXPORT void npy_intp dp = dimensions[2]; npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3], os_m=steps[4], os_p=steps[5]; - npy_intp ib1_n, ib2_n, ib2_p, ob_p; -#if @USEBLAS@ & defined(HAVE_CBLAS) +#if @USEBLAS@ && defined(HAVE_CBLAS) + npy_intp sz = sizeof(@typ@); npy_bool special_case = (dm == 1 || dn == 1 || dp == 1); - npy_bool scalar_out = (dm ==1 && dp == 1); + npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0); + npy_bool scalar_out = (dm == 1 && dp == 1); npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1)); - npy_bool too_big_for_blas = (dm > NPY_MAX_INT || dn > NPY_MAX_INT || - dp >= NPY_MAX_INT); - npy_bool input_contiguous = ((is1_m == sizeof(@typ@) || - is1_n == sizeof(@typ@)) && - (is2_n == sizeof(@typ@) || - is2_p == sizeof(@typ@))); - npy_bool vector_matrix = ((dm == 1) && - (is2_n == sizeof(@typ@) || (is2_n == sizeof(@typ@) * dp))); - npy_bool matrix_vector = ((dp == 1) && - (is1_n == sizeof(@typ@) || (is1_n == sizeof(@typ@) * dm))); + npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE || + dp > BLAS_MAXSIZE); + npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz); + npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz); + npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz); + npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz); + npy_bool i1blasable = i1_c_blasable || i1_f_blasable; + npy_bool i2blasable = i2_c_blasable || i2_f_blasable; + npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz); + npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz); + npy_bool vector_matrix = ((dm == 1) && i2blasable); + npy_bool matrix_vector = ((dp == 1) && i1blasable); #endif - ib1_n = is1_n*dn; - ib2_n = is2_n*dn; - ib2_p = is2_p*dp; - ob_p = os_p *dp; - - if (dn == 0) { - /* No operand, need to zero the output */ - for (iOuter = 0; iOuter < dOuter; iOuter++, - args[0] += s0, args[1] += s1, args[2] += s2) { - npy_intp m, p; - char *op=args[2]; - for (m = 0; m < dm; m++) { - for (p = 0; p < dp; p++) { -#if @SPECL@ == 1 - (*(@typ@ *)op).real = 0; - (*(@typ@ *)op).imag = 0; -#else - *(@typ@ *)op = 0; -#endif - op += os_p; - } - op += os_m - ob_p; - } - } - return; - } for (iOuter = 0; iOuter < dOuter; iOuter++, args[0] += s0, args[1] += s1, args[2] += s2) { - char *ip1=args[0], *ip2=args[1], *op=args[2]; -#if @USEBLAS@ & defined(HAVE_CBLAS) + void *ip1=args[0], *ip2=args[1], *op=args[2]; +#if @USEBLAS@ && defined(HAVE_CBLAS) /* * TODO: refactor this out to a inner_loop_selector, in * PyUFunc_MatmulLoopSelector. But that call does not have access to * n, m, p and strides. */ - if (too_big_for_blas) { - @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp, - ib1_n, ib2_n, ib2_p, ob_p, - is1_m, is1_n, is2_n, is2_p, os_m, os_p); + if (too_big_for_blas || any_zero_dim) { + @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, + ip2, is2_n, is2_p, + op, os_m, os_p, dm, dn, dp); } else if (special_case) { /* Special case variants that have a 1 in the core dimensions */ @@ -355,44 +337,59 @@ NPY_NO_EXPORT void @TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL); } else if (scalar_vec){ /* - * 0d @ vector or vector @ 0d + * 1,1d @ vector or vector @ 1,1d * could use cblas_Xaxy, but that requires 0ing output * and would not be faster (XXX prove it) */ - @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp, - ib1_n, ib2_n, ib2_p, ob_p, - is1_m, is1_n, is2_n, is2_p, os_m, os_p); + @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, + ip2, is2_n, is2_p, + op, os_m, os_p, dm, dn, dp); } else if (vector_matrix) { /* vector @ matrix, switch ip1, ip2, p and m */ - @TYPE@_gemv((void*)ip2, is2_n, (void*)ip1, is1_n, - (void*)op, dp, dn); + @TYPE@_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m, + op, os_p, os_m, dp, dn, dm); } else if (matrix_vector) { /* matrix @ vector */ - @TYPE@_gemv((void*)ip1, is1_n, (void*)ip2, is2_n, - (void*)op, dm, dn); + @TYPE@_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p, + + op, os_m, os_p, dm, dn, dp); } else { - /* column @ row, 2d output, no blas needed or non-contiguous input */ - @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp, - ib1_n, ib2_n, ib2_p, ob_p, - is1_m, is1_n, is2_n, is2_p, os_m, os_p); + /* column @ row, 2d output, no blas needed or non-blas-able input */ + @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, + ip2, is2_n, is2_p, + op, os_m, os_p, dm, dn, dp); } } else { /* matrix @ matrix */ - if (input_contiguous) { - /* can only use blas if input is contiguous */ - @TYPE@_matmul_matrixmatrix((void*)ip1, is1_m, is1_n, - (void*)ip2, is2_n, is2_p, - (void*)op, dm, dn, dp); + if (i1blasable && i2blasable && o_c_blasable) { + @TYPE@_matmul_matrixmatrix(ip1, is1_m, is1_n, + ip2, is2_n, is2_p, + op, os_m, os_p, + dm, dn, dp); + } else if (i1blasable && i2blasable && o_f_blasable) { + /* + * Use transpose equivalence: + * matmul(a, b, o) == matmul(b.T, a.T, o.T) + */ + @TYPE@_matmul_matrixmatrix(ip2, is2_p, is2_n, + ip1, is1_n, is1_m, + op, os_p, os_m, + dp, dn, dm); } else { - @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp, - ib1_n, ib2_n, ib2_p, ob_p, - is1_m, is1_n, is2_n, is2_p, os_m, os_p); + /* + * If parameters are castable to int and we copy the + * non-blasable (or non-ccontiguous output) + * we could still use BLAS, see gh-12365. + */ + @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, + ip2, is2_n, is2_p, + op, os_m, os_p, dm, dn, dp); } } #else - @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp, - ib1_n, ib2_n, ib2_p, ob_p, - is1_m, is1_n, is2_n, is2_p, os_m, os_p); + @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, + ip2, is2_n, is2_p, + op, os_m, os_p, dm, dn, dp); #endif } diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c index 66f512f7b..5dc1db226 100644 --- a/numpy/core/src/umath/ufunc_object.c +++ b/numpy/core/src/umath/ufunc_object.c @@ -319,6 +319,23 @@ _find_array_prepare(ufunc_full_args args, NPY_ITER_NO_BROADCAST | \ NPY_ITER_NO_SUBTYPE | \ NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE + +/* Called at module initialization to set the matmul ufunc output flags */ +NPY_NO_EXPORT int +set_matmul_flags(PyObject *d) +{ + PyObject *matmul = PyDict_GetItemString(d, "matmul"); + if (matmul == NULL) { + return -1; + } + ((PyUFuncObject *)matmul)->op_flags[2] = (NPY_ITER_WRITEONLY | + NPY_ITER_UPDATEIFCOPY | + NPY_UFUNC_DEFAULT_OUTPUT_FLAGS) & + ~NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE; + return 0; +} + + /* * Set per-operand flags according to desired input or output flags. * op_flags[i] for i in input (as determined by ufunc->nin) will be diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c index efd923972..ec60d9cfd 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.c +++ b/numpy/core/src/umath/ufunc_type_resolution.c @@ -1304,89 +1304,6 @@ PyUFunc_MixedDivisionTypeResolver(PyUFuncObject *ufunc, type_tup, out_dtypes); } -/* - * XXX This is too restrictive, we should only check the inner axis involved - * gh-12365 - */ -#define NOT_CONTIGUOUS(a) (PyArray_NDIM(a) >=2 && !( \ - PyArray_IS_C_CONTIGUOUS(a) || PyArray_IS_F_CONTIGUOUS(a))) - -/* - * This function applies special type resolution rules for the case - * where all the functions have the pattern XX->X, using - * PyArray_ResultType instead of a linear search to get the best - * loop, like PyUFunc_SimpleBinaryOperationTypeResolver, and adds - * memory overlap and contiguity considerations to the operands, - * possibly creating Writeback temporary data - * - * Returns 0 on success, -1 on error. - */ -NPY_NO_EXPORT int -PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc, - NPY_CASTING casting, - PyArrayObject **operands, - PyObject *type_tup, - PyArray_Descr **out_dtypes) -{ - - if (PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting, operands, - type_tup, out_dtypes) < 0) { - return -1; - } - if (PyArray_NDIM(operands[0]) >= 2 || PyArray_NDIM(operands[1]) >= 2) { -#if defined(HAVE_CBLAS) - int typenum = out_dtypes[2]->type_num; - if ( (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum || - NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) { - /* - * We are going to use BLAS - * make sure 2d and more arrays are contiguous - */ - if (_bad_strides(operands[0]) || NOT_CONTIGUOUS(operands[0])) { - PyObject *op = PyArray_NewCopy(operands[0], NPY_ANYORDER); - - if (op == NULL) { - return -1; - } - Py_DECREF(operands[0]); - operands[0] = (PyArrayObject *)op; - } - if (_bad_strides(operands[1]) || NOT_CONTIGUOUS(operands[1])) { - PyObject *op = PyArray_NewCopy(operands[1], NPY_ANYORDER); - - if (op == NULL) { - return -1; - } - Py_DECREF(operands[1]); - operands[1] = (PyArrayObject *)op; - } - } -#endif - if (operands[2] != NULL) { - npy_intp last_stride = PyArray_STRIDE(operands[2], - PyArray_NDIM(operands[2]) - 1); - if (last_stride != PyArray_ITEMSIZE(operands[2])) { - PyErr_SetString(PyExc_ValueError, - "output array is not acceptable (must be C-contiguous)"); - return -1; - } - /* - * Use all the flags in PyUFunc_GeneralizedFunction - * except NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE and - * NPY_ITER_WRITEONLY. Without NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE - * nditer will properly check overlap between output and any input. - */ - ufunc->op_flags[2] = NPY_ITER_WRITEONLY | - NPY_ITER_UPDATEIFCOPY | - NPY_ITER_ALIGNED | - NPY_ITER_ALLOCATE | - NPY_ITER_NO_BROADCAST; - } - } - return 0; -} - - static int find_userloop(PyUFuncObject *ufunc, PyArray_Descr **dtypes, diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h index 5306a5983..2f37af753 100644 --- a/numpy/core/src/umath/ufunc_type_resolution.h +++ b/numpy/core/src/umath/ufunc_type_resolution.h @@ -145,11 +145,4 @@ PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc, NpyAuxData **out_innerloopdata, int *out_needs_api); -NPY_NO_EXPORT int -PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc, - NPY_CASTING casting, - PyArrayObject **operands, - PyObject *type_tup, - PyArray_Descr **out_dtypes); - #endif diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py index 80c69d9b3..fbf8ac0f5 100644 --- a/numpy/core/tests/test_multiarray.py +++ b/numpy/core/tests/test_multiarray.py @@ -5716,7 +5716,7 @@ class MatmulCommon(object): res = self.matmul(v, v) assert_(type(res) is np.dtype(dt).type) - def test_0d_vector_values(self): + def test_scalar_output(self): vec1 = np.array([2]) vec2 = np.array([3, 4]).reshape(1, -1) tgt = np.array([6, 8]) @@ -5876,33 +5876,29 @@ class TestMatmul(MatmulCommon): matmul = np.matmul def test_out_arg(self): - a = np.ones((2, 2), dtype=float) - b = np.ones((2, 2), dtype=float) - tgt = np.full((2,2), 2, dtype=float) + a = np.ones((5, 2), dtype=float) + b = np.array([[1, 3], [5, 7]], dtype=float) + tgt = np.dot(a, b) # test as positional argument msg = "out positional argument" - out = np.zeros((2, 2), dtype=float) + out = np.zeros((5, 2), dtype=float) self.matmul(a, b, out) assert_array_equal(out, tgt, err_msg=msg) # test as keyword argument msg = "out keyword argument" - out = np.zeros((2, 2), dtype=float) + out = np.zeros((5, 2), dtype=float) self.matmul(a, b, out=out) assert_array_equal(out, tgt, err_msg=msg) # test out with not allowed type cast (safe casting) msg = "Cannot cast ufunc matmul output" - out = np.zeros((2, 2), dtype=np.int32) + out = np.zeros((5, 2), dtype=np.int32) assert_raises_regex(TypeError, msg, self.matmul, a, b, out=out) - # cblas does not allow non-contiguous - # outputs and consistency with dot would require same type, - # dimensions, subtype, and c_contiguous. - # test out with type upcast to complex - out = np.zeros((2, 2), dtype=np.complex128) + out = np.zeros((5, 2), dtype=np.complex128) c = self.matmul(a, b, out=out) assert_(c is out) with suppress_warnings() as sup: @@ -5910,15 +5906,39 @@ class TestMatmul(MatmulCommon): c = c.astype(tgt.dtype) assert_array_equal(c, tgt) + def test_out_contiguous(self): + a = np.ones((5, 2), dtype=float) + b = np.array([[1, 3], [5, 7]], dtype=float) + v = np.array([1, 3], dtype=float) + tgt = np.dot(a, b) + tgt_mv = np.dot(a, v) + # test out non-contiguous - out = np.ones((2, 2, 2), dtype=float) - assert_raises(ValueError, self.matmul, a, b, out=out[..., 0]) + out = np.ones((5, 2, 2), dtype=float) + c = self.matmul(a, b, out=out[..., 0]) + assert c.base is out + assert_array_equal(c, tgt) + c = self.matmul(a, v, out=out[:, 0, 0]) + assert_array_equal(c, tgt_mv) + c = self.matmul(v, a.T, out=out[:, 0, 0]) + assert_array_equal(c, tgt_mv) + + # test out contiguous in only last dim + out = np.ones((10, 2), dtype=float) + c = self.matmul(a, b, out=out[::2, :]) + assert_array_equal(c, tgt) + + # test transposes of out, args + out = np.ones((5, 2), dtype=float) + c = self.matmul(b.T, a.T, out=out.T) + assert_array_equal(out, tgt) m1 = np.arange(15.).reshape(5, 3) m2 = np.arange(21.).reshape(3, 7) m3 = np.arange(30.).reshape(5, 6)[:, ::2] # non-contiguous vc = np.arange(10.) vr = np.arange(6.) + m0 = np.zeros((3, 0)) @pytest.mark.parametrize('args', ( # matrix-matrix (m1, m2), (m2.T, m1.T), (m2.T.copy(), m1.T), (m2.T, m1.T.copy()), @@ -5934,12 +5954,19 @@ class TestMatmul(MatmulCommon): # vector-matrix, matrix-vector, matrix non-contiguous (m3, vr[:3]), (vc[:5], m3), (m3.T, vc[:5]), (vr[:3], m3.T), # vector-matrix, matrix-vector, both non-contiguous - (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T))) + (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T), + # size == 0 + (m0, m0.T), (m0.T, m0), (m1, m0), (m0.T, m1.T), + )) def test_dot_equivalent(self, args): r1 = np.matmul(*args) r2 = np.dot(*args) assert_equal(r1, r2) + r3 = np.matmul(args[0].copy(), args[1].copy()) + assert_equal(r1, r3) + + if sys.version_info[:2] >= (3, 5): class TestMatmulOperator(MatmulCommon): |