ENH: corrections and fixes from review

author: mattip <matti.picus@gmail.com> 2018-11-25 10:46:21 -0600
committer: mattip <matti.picus@gmail.com> 2018-12-01 16:13:59 -0800
commit: 72f2abf34a43cf3642acf6a6ff951148051404ae (patch)
tree: d6df709bea86487caf12595f81f6ca76e7e8a373 /numpy/core
parent: 8c9450a7fd69d5b74b47ffec60b5c235361daeff (diff)
download: numpy-72f2abf34a43cf3642acf6a6ff951148051404ae.tar.gz
10 files changed, 242 insertions, 287 deletions
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index a092cb95e..7522de2ad 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -136,7 +136,7 @@ class Ufunc(object):
         self.docstring = docstring
         self.typereso = typereso
         self.type_descriptions = []
-        self.signature = kwargs.pop('signature', 'NULL')
+        self.signature = kwargs.pop('signature', None)
         for td in type_descriptions:
             self.type_descriptions.extend(td)
         for td in self.type_descriptions:
@@ -911,9 +911,9 @@ defdict = {
 'matmul' :
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.matmul'),
-          "PyUFunc_MatmulTypeResolver",
+          "PyUFunc_SimpleBinaryOperationTypeResolver",
           TD(notimes_or_obj),
-          signature='"(n?,k),(k,m?)->(n?,m?)"',
+          signature='(n?,k),(k,m?)->(n?,m?)',
           ),
 }
 
@@ -1060,6 +1060,10 @@ def make_ufuncs(funcdict):
         # string literal in C code. We split at endlines because textwrap.wrap
         # do not play well with \n
         docstring = '\\n\"\"'.join(docstring.split(r"\n"))
+        if uf.signature is None:
+            sig = "NULL"
+        else:
+            sig = '"{}"'.format(uf.signature)
         fmt = textwrap.dedent("""\
             identity = {identity_expr};
             if ({has_identity} && identity == NULL) {{
@@ -1068,7 +1072,7 @@ def make_ufuncs(funcdict):
             f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
                 {name}_functions, {name}_data, {name}_signatures, {nloops},
                 {nin}, {nout}, {identity}, "{name}",
-                "{doc}", 0, {signature}, identity
+                "{doc}", 0, {sig}, identity
             );
             if ({has_identity}) {{
                 Py_DECREF(identity);
@@ -1084,7 +1088,7 @@ def make_ufuncs(funcdict):
             identity='PyUFunc_IdentityValue',
             identity_expr=uf.identity,
             doc=docstring,
-            signature=uf.signature,
+            sig=sig,
         )
 
         # Only PyUFunc_None means don't reorder - we pass this using the old
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index d79702a18..8a690c43d 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -2558,7 +2558,7 @@ add_newdoc('numpy.core.umath', 'matmul',
         For other keyword-only arguments, see the
         :ref:`ufunc docs <ufuncs.kwargs>`.
 
-        ..versionchanged:: 1.16
+        ..versionadded:: 1.16
           Now handles ufunc kwargs
 
     Returns
@@ -2598,11 +2598,11 @@ add_newdoc('numpy.core.umath', 'matmul',
       appending a 1 to its dimensions. After matrix multiplication
       the appended 1 is removed.
 
-    ``matmul`` differs from ``dot`` in two important ways.
+    ``matmul`` differs from ``dot`` in two important ways:
 
     - Multiplication by scalars is not allowed, use ``*`` instead.
     - Stacks of matrices are broadcast together as if the matrices
-      were elements, respecting the signature `(n,k),(k,m)->(n,m)`:
+      were elements, respecting the signature ``(n,k),(k,m)->(n,m)``:
 
       >>> a = a = np.full([9,5,7,3], True, dtype=bool)
       >>> c = np.full([9, 5, 4,3], True, dtype=bool)
@@ -2639,13 +2639,13 @@ add_newdoc('numpy.core.umath', 'matmul',
 
     Broadcasting is conventional for stacks of arrays
 
-    >>> a = np.arange(2*2*4).reshape((2,2,4))
-    >>> b = np.arange(2*2*4).reshape((2,4,2))
+    >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4))
+    >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2))
     >>> np.matmul(a,b).shape
     (2, 2, 2)
-    >>> np.matmul(a,b)[0,1,1]
+    >>> np.matmul(a, b)[0, 1, 1]
     98
-    >>> sum(a[0,1,:] * b[0,:,1])
+    >>> sum(a[0, 1, :] * b[0 , :, 1])
     98
 
     Vector, vector returns the scalar inner product, but neither argument
@@ -2659,7 +2659,7 @@ add_newdoc('numpy.core.umath', 'matmul',
     >>> np.matmul([1,2], 3)
     Traceback (most recent call last):
     ...
-    ValueError: Scalar operands are not allowed, use '*' instead
+    ValueError: matmul: Input operand 1 does not have enough dimensions ...
 
     .. versionadded:: 1.10.0
     """)
diff --git a/numpy/core/src/common/cblasfuncs.c b/numpy/core/src/common/cblasfuncs.c
index 514297940..64a0569ad 100644
--- a/numpy/core/src/common/cblasfuncs.c
+++ b/numpy/core/src/common/cblasfuncs.c
@@ -201,7 +201,6 @@ _bad_strides(PyArrayObject *ap)
     return 0;
 }
 
-
 /*
  * dot(a,b)
  * Returns the dot product of a and b for arrays of floating point types.
diff --git a/numpy/core/src/common/cblasfuncs.h b/numpy/core/src/common/cblasfuncs.h
index 78eff25a0..66ce4ca5b 100644
--- a/numpy/core/src/common/cblasfuncs.h
+++ b/numpy/core/src/common/cblasfuncs.h
@@ -3,7 +3,5 @@
 
 NPY_NO_EXPORT PyObject *
 cblas_matrixproduct(int, PyArrayObject *, PyArrayObject *, PyArrayObject *);
-NPY_NO_EXPORT int
-_bad_strides(PyArrayObject *ap);
 
 #endif
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index 5ccb7f6d6..ce8af4392 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -75,6 +75,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 #include "umathmodule.h"
 
 NPY_NO_EXPORT int initscalarmath(PyObject *);
+NPY_NO_EXPORT int set_matmul_flags(PyObject *d); /* in ufunc_object.c */
 
 /*
  * global variable to determine if legacy printing is enabled, accessible from
@@ -4505,7 +4506,6 @@ intern_strings(void)
            npy_ma_str_ndmin && npy_ma_str_axis1 && npy_ma_str_axis2;
 }
 
-
 #if defined(NPY_PY3K)
 static struct PyModuleDef moduledef = {
         PyModuleDef_HEAD_INIT,
@@ -4575,6 +4575,9 @@ PyMODINIT_FUNC init_multiarray_umath(void) {
         goto err;
     }
 
+    if (set_matmul_flags(d) < 0) {
+        goto err;
+    }
     initialize_casting_tables();
     initialize_numeric_types();
     if(initscalarmath(m) < 0)
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index 5f5bd3506..3ee0ec4f2 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -18,6 +18,7 @@
 
 #include "npy_cblas.h"
 #include "arraytypes.h" /* For TYPE_dot functions */
+
 #include <assert.h>
 
 /*
@@ -26,6 +27,36 @@
  *****************************************************************************
  */
 
+/*
+ * -1 to be conservative, in case blas internally uses a for loop with an
+ * inclusive upper bound
+ */
+#define BLAS_MAXSIZE (NPY_MAX_INT - 1)
+
+/*
+ * Determine if a 2d matrix can be used by BLAS
+ * 1. Strides must not alias or overlap
+ * 2. The faster (second) axis must be contiguous
+ * 3. The slower (first) axis stride, in unit steps, must be larger than
+ *    the faster axis dimension
+ */
+static NPY_INLINE npy_bool
+is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
+              npy_intp d1, npy_intp d2,  npy_intp itemsize)
+{
+    npy_intp unit_stride1 = byte_stride1 / itemsize;
+    if (byte_stride2 != itemsize) {
+        return NPY_FALSE;
+    }
+    if ((byte_stride1 % itemsize ==0) &&
+        (unit_stride1 >= d2) &&
+        (unit_stride1 <= BLAS_MAXSIZE))
+    {
+        return NPY_TRUE;
+    }
+    return NPY_FALSE;
+}
+
 #if defined(HAVE_CBLAS)
 static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
 static const npy_cfloat  oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
@@ -34,76 +65,89 @@ static const npy_cfloat  oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
  *
  * #name = FLOAT, DOUBLE, CFLOAT, CDOUBLE#
  * #ctype = npy_float, npy_double, npy_cfloat, npy_cdouble#
- * #type = npy_float, npy_double, npy_cfloat, npy_cdouble#
+ * #typ = npy_float, npy_double, npy_cfloat, npy_cdouble#
  * #prefix = s, d, c, z#
  * #step1 = 1.F, 1., &oneF, &oneD#
  * #step0 = 0.F, 0., &zeroF, &zeroD#
  */
 NPY_NO_EXPORT void
-@name@_gemv(void *ip1, npy_intp is1_m, void *ip2, npy_intp is2_n, void *op,
-               npy_intp m, npy_intp n)
+@name@_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
+            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
+            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
+            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
 {
     /*
      * Vector matrix multiplication -- Level 2 BLAS
      * arguments
      * ip1: contiguous data, m*n shape
      * ip2: data in c order, n*1 shape
-     * op:  contiguous data in c order, m shape
+     * op:  data in c order, m shape
      */
     enum CBLAS_ORDER order;
-    int lda;
+    int M, N, lda;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
+    assert (is_blasable2d(is2_n, sizeof(@typ@), n, 1, sizeof(@typ@)));
+    M = (int)m;
+    N = (int)n;
 
-    if (is1_m == sizeof(@type@)) {
+    if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
         order = CblasColMajor;
-        lda = n;
+        lda = (int)(is1_m / sizeof(@typ@));
     }
     else {
         /* If not ColMajor, caller should have ensured we are RowMajor */
         /* will not assert in release mode */
-        assert(is1_m == sizeof(@type@) * m);
         order = CblasRowMajor;
-        lda = m;
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
+        lda = (int)(is1_n / sizeof(@typ@));
     }
-    cblas_@prefix@gemv(order, CblasTrans, n, m, @step1@, ip1, lda, ip2,
-                                     is2_n / sizeof(@type@), @step0@, op, 1);
+    cblas_@prefix@gemv(order, CblasTrans, N, M, @step1@, ip1, lda, ip2,
+                                     is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@));
 }
 
 NPY_NO_EXPORT void
 @name@_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
                            void *ip2, npy_intp is2_n, npy_intp is2_p,
-                           void *op, npy_intp m, npy_intp n, npy_intp p)
+                           void *op, npy_intp os_m, npy_intp os_p,
+                           npy_intp m, npy_intp n, npy_intp p)
 {
     /*
      * matrix matrix multiplication -- Level 3 BLAS
      */
     enum CBLAS_ORDER order = CblasRowMajor;
     enum CBLAS_TRANSPOSE trans1, trans2;
-    int M, N, P, lda, ldb;
-    M = m;
-    N = n;
-    P = p;
+    int M, N, P, lda, ldb, ldc;
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
+    M = (int)m;
+    N = (int)n;
+    P = (int)p;
 
-    if (is1_m == sizeof(@type@)) {
-        trans1 = CblasTrans;
-        lda = N > 1 ? is1_n / sizeof(@type@) : 1;
+    assert(is_blasable2d(os_m, os_p, m, p, sizeof(@typ@)));
+    ldc = (int)(os_m / sizeof(@typ@));
+
+    if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
+        trans1 = CblasNoTrans;
+        lda = (int)(is1_m / sizeof(@typ@));
     }
     else {
         /* If not ColMajor, caller should have ensured we are RowMajor */
         /* will not assert in release mode */
-        assert(is1_n == sizeof(@type@));
-        trans1 = CblasNoTrans;
-        lda = N > 1 ? is1_m / sizeof(@type@) : 1;
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
+        trans1 = CblasTrans;
+        lda = (int)(is1_n / sizeof(@typ@));
     }
-    if (is2_n == sizeof(@type@)) {
-        trans2 = CblasTrans;
-        ldb = N > 1 ? is2_p / sizeof(@type@) : 1;
+
+    if (is_blasable2d(is2_n, is2_p, n, p, sizeof(@typ@))) {
+        trans2 = CblasNoTrans;
+        ldb = (int)(is2_n / sizeof(@typ@));
     }
     else {
         /* If not ColMajor, caller should have ensured we are RowMajor */
         /* will not assert in release mode */
-        assert(is2_p == sizeof(@type@));
-        trans2 = CblasNoTrans;
-        ldb = P > 1 ? P : 1;
+        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(@typ@)));
+        trans2 = CblasTrans;
+        ldb = (int)(is2_p / sizeof(@typ@));
     }
     /*
      * Use syrk if we have a case of a matrix times its transpose.
@@ -119,23 +163,23 @@ NPY_NO_EXPORT void
         npy_intp i,j;
         if (trans1 == CblasNoTrans) {
             cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
-                               ip1, lda, @step0@, op, P);
+                               ip1, lda, @step0@, op, ldc);
         }
         else {
             cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
-                               ip1, ldb, @step0@, op, P);
+                               ip1, ldb, @step0@, op, ldc);
         }
         /* Copy the triangle */
         for (i = 0; i < P; i++) {
             for (j = i + 1; j < P; j++) {
-                ((@type@*)op)[j * P + i] = ((@type@*)op)[i * P + j];
+                ((@typ@*)op)[j * ldc + i] = ((@typ@*)op)[i * ldc + j];
             }
         }
 
     }
     else {
         cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda,
-                           ip2, ldb, @step0@, op, P);
+                           ip2, ldb, @step0@, op, ldc);
     }
 }
 
@@ -148,94 +192,54 @@ NPY_NO_EXPORT void
  */
 
 /**begin repeat
- * #TYPE = FLOAT, DOUBLE, HALF#
- * #typ = npy_float,npy_double,npy_half#
- * #SPECL = 0,0,1#
- */
-
-NPY_NO_EXPORT void
-@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
-                           npy_intp dm, npy_intp dn, npy_intp dp,
-                           npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
-                           npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
-                           npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
-                           npy_intp os_p)
-{
-    npy_intp m, n, p;
-    for (m = 0; m < dm; m++) {
-        for (p = 0; p < dp; p++) {
-            /*
-             * Use a double as an intermediate sum, which is natural for
-             * npy_double, slightly increases the accuracy of npy_float,
-             * and is perhaps overkill for npy_half.
-             */
-            double sum = 0;
-            for (n = 0; n < dn; n++) {
-#if @SPECL@ == 1
-                @typ@ val1 = (*(@typ@ *)ip1);
-                @typ@ val2 = (*(@typ@ *)ip2);
-                sum += npy_half_to_float(val1) * npy_half_to_float(val2);
-#else
-                sum += ((double)*(@typ@ *)ip1) * ((double)*(@typ@ *)ip2);
-#endif
-                ip2 += is2_n;
-                ip1 += is1_n;
-            }
-#if @SPECL@ == 1
-            *(@typ@ *)op = npy_float_to_half((float)sum);
-#else
-            /* in the case of double -> float, may produce INF */
-            *(@typ@ *)op = (@typ@)sum;
-#endif
-            ip1 -= ib1_n;
-            ip2 -= ib2_n;
-            op  +=  os_p;
-            ip2 += is2_p;
-        }
-        op -= ob_p;
-        ip2 -= ib2_p;
-        ip1 += is1_m;
-        op  +=  os_m;
-    }
-}
-
-/**end repeat**/
-
-/**begin repeat
  *  #TYPE = LONGDOUBLE,
+ *          FLOAT, DOUBLE, HALF,
  *          CFLOAT, CDOUBLE, CLONGDOUBLE,
  *          UBYTE, USHORT, UINT, ULONG, ULONGLONG,
  *          BYTE, SHORT, INT, LONG, LONGLONG,
  *          BOOL#
  *  #typ = npy_longdouble,
+ *         npy_float,npy_double,npy_half,
  *         npy_cfloat, npy_cdouble, npy_clongdouble,
  *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
  *         npy_bool#
- * #IS_COMPLEX = 0, 1, 1, 1, 0*11#
+ * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*11#
+ * #IS_HALF = 0, 0, 0, 1, 0*14#
  */
 
 NPY_NO_EXPORT void
-@TYPE@_matmul_inner_noblas(char *ip1, char *ip2, char *op,
-                           npy_intp dm, npy_intp dn, npy_intp dp,
-                           npy_intp ib1_n, npy_intp ib2_n, npy_intp ib2_p,
-                           npy_intp ob_p, npy_intp is1_m, npy_intp is1_n,
-                           npy_intp is2_n, npy_intp is2_p, npy_intp os_m,
-                           npy_intp os_p)
+@TYPE@_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+                           
 {
     npy_intp m, n, p;
+    npy_intp ib1_n, ib2_n, ib2_p, ob_p;
+    char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;
+
+    ib1_n = is1_n * dn;
+    ib2_n = is2_n * dn;
+    ib2_p = is2_p * dp;
+    ob_p  = os_p * dp;
+
     for (m = 0; m < dm; m++) {
         for (p = 0; p < dp; p++) {
 #if @IS_COMPLEX@ == 1
             (*(@typ@ *)op).real = 0;
             (*(@typ@ *)op).imag = 0;
+#elif @IS_HALF@
+            float sum = 0;
 #else
             *(@typ@ *)op = 0;
 #endif
             for (n = 0; n < dn; n++) {
                 @typ@ val1 = (*(@typ@ *)ip1);
                 @typ@ val2 = (*(@typ@ *)ip2);
-#if @IS_COMPLEX@ == 1
+#if @IS_HALF@
+                sum += npy_half_to_float(val1) * npy_half_to_float(val2);
+#elif @IS_COMPLEX@ == 1
                 (*(@typ@ *)op).real += (val1.real * val2.real) -
                                        (val1.imag * val2.imag);
                 (*(@typ@ *)op).imag += (val1.real * val2.imag) +
@@ -246,6 +250,9 @@ NPY_NO_EXPORT void
                 ip2 += is2_n;
                 ip1 += is1_n;
             }
+#if @IS_HALF@
+            *(@typ@ *)op = npy_float_to_half(sum);
+#endif
             ip1 -= ib1_n;
             ip2 -= ib2_n;
             op  +=  os_p;
@@ -271,10 +278,8 @@ NPY_NO_EXPORT void
  *         npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong,
  *         npy_byte, npy_short, npy_int, npy_long, npy_longlong,
  *         npy_bool#
- * #SPECL = 0, 0, 0, 2, 1, 1, 1, 0*11#
+ * #IS_COMPLEX = 0, 0, 0, 0, 1, 1, 1, 0*11#
  * #USEBLAS = 1, 1, 0, 0, 1, 1, 0*12#
- * #chr = s, d, 0, 0, c, z, 0*12#
- * #blas_typ = npy_float, npy_double, void*16#
  */
 
 
@@ -291,62 +296,39 @@ NPY_NO_EXPORT void
     npy_intp dp = dimensions[2];
     npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
          os_m=steps[4], os_p=steps[5];
-    npy_intp ib1_n, ib2_n, ib2_p, ob_p;
-#if @USEBLAS@ & defined(HAVE_CBLAS)
+#if @USEBLAS@ && defined(HAVE_CBLAS)
+    npy_intp sz = sizeof(@typ@);
     npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
-    npy_bool scalar_out = (dm ==1 && dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
     npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
-    npy_bool too_big_for_blas = (dm > NPY_MAX_INT || dn > NPY_MAX_INT ||
-                                 dp >= NPY_MAX_INT);
-    npy_bool input_contiguous = ((is1_m == sizeof(@typ@) ||
-                                  is1_n == sizeof(@typ@)) &&
-                                 (is2_n == sizeof(@typ@) ||
-                                  is2_p == sizeof(@typ@)));
-    npy_bool vector_matrix = ((dm == 1) &&
-                   (is2_n == sizeof(@typ@) || (is2_n == sizeof(@typ@) * dp)));
-    npy_bool matrix_vector = ((dp == 1)  &&
-                   (is1_n == sizeof(@typ@) || (is1_n == sizeof(@typ@) * dm)));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable);
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable);
 #endif
 
-    ib1_n = is1_n*dn;
-    ib2_n = is2_n*dn;
-    ib2_p = is2_p*dp;
-    ob_p  = os_p *dp;
-
-    if (dn == 0) {
-        /* No operand, need to zero the output */
-        for (iOuter = 0; iOuter < dOuter; iOuter++,
-                        args[0] += s0, args[1] += s1, args[2] += s2) {
-            npy_intp m, p;
-            char *op=args[2];
-            for (m = 0; m < dm; m++) {
-                for (p = 0; p < dp; p++) {
-#if @SPECL@ == 1
-                    (*(@typ@ *)op).real = 0;
-                    (*(@typ@ *)op).imag = 0;
-#else
-                    *(@typ@ *)op = 0;
-#endif
-                    op  +=  os_p;
-                }
-                op  +=  os_m - ob_p;
-            }
-        }
-        return;
-    }
     for (iOuter = 0; iOuter < dOuter; iOuter++,
                          args[0] += s0, args[1] += s1, args[2] += s2) {
-        char *ip1=args[0], *ip2=args[1], *op=args[2];
-#if @USEBLAS@ & defined(HAVE_CBLAS)
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if @USEBLAS@ && defined(HAVE_CBLAS)
         /*
          * TODO: refactor this out to a inner_loop_selector, in
          * PyUFunc_MatmulLoopSelector. But that call does not have access to
          * n, m, p and strides.
          */
-        if (too_big_for_blas) {
-            @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
-                      ib1_n, ib2_n, ib2_p, ob_p,
-                      is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+        if (too_big_for_blas || any_zero_dim) {
+            @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
         }
         else if (special_case) {
             /* Special case variants that have a 1 in the core dimensions */
@@ -355,44 +337,59 @@ NPY_NO_EXPORT void
                 @TYPE@_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
             } else if (scalar_vec){
                 /*
-                 * 0d @ vector or vector @ 0d
+                 * 1,1d @ vector or vector @ 1,1d
                  * could use cblas_Xaxy, but that requires 0ing output
                  * and would not be faster (XXX prove it)
                  */
-                @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
-                          ib1_n, ib2_n, ib2_p, ob_p,
-                          is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+                @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
             } else if (vector_matrix) {
                 /* vector @ matrix, switch ip1, ip2, p and m */
-                @TYPE@_gemv((void*)ip2, is2_n, (void*)ip1, is1_n,
-                            (void*)op, dp, dn);
+                @TYPE@_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
             } else if  (matrix_vector) {
                 /* matrix @ vector */
-                @TYPE@_gemv((void*)ip1, is1_n, (void*)ip2, is2_n,
-                            (void*)op, dm, dn);
+                @TYPE@_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
             } else {
-                /* column @ row, 2d output, no blas needed or non-contiguous input */
-                @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
-                          ib1_n, ib2_n, ib2_p, ob_p,
-                          is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
             }
         } else {
             /* matrix @ matrix */
-            if (input_contiguous) {
-                /* can only use blas if input is contiguous */
-                @TYPE@_matmul_matrixmatrix((void*)ip1, is1_m, is1_n,
-                                           (void*)ip2, is2_n, is2_p,
-                                           (void*)op, dm, dn, dp);
+            if (i1blasable && i2blasable && o_c_blasable) {
+                @TYPE@_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                @TYPE@_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
             } else {
-                @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
-                          ib1_n, ib2_n, ib2_p, ob_p,
-                          is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
             }
         }
 #else
-        @TYPE@_matmul_inner_noblas(ip1, ip2, op, dm, dn, dp,
-                          ib1_n, ib2_n, ib2_p, ob_p,
-                          is1_m, is1_n, is2_n, is2_p, os_m, os_p);
+        @TYPE@_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
 
 #endif
     }
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 66f512f7b..5dc1db226 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -319,6 +319,23 @@ _find_array_prepare(ufunc_full_args args,
     NPY_ITER_NO_BROADCAST | \
     NPY_ITER_NO_SUBTYPE | \
     NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE
+
+/* Called at module initialization to set the matmul ufunc output flags */
+NPY_NO_EXPORT int
+set_matmul_flags(PyObject *d)
+{
+    PyObject *matmul = PyDict_GetItemString(d, "matmul");
+    if (matmul == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)matmul)->op_flags[2] = (NPY_ITER_WRITEONLY |
+                                         NPY_ITER_UPDATEIFCOPY |
+                                         NPY_UFUNC_DEFAULT_OUTPUT_FLAGS) &
+                                         ~NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE;
+    return 0;
+}
+
+
 /*
  * Set per-operand flags according to desired input or output flags.
  * op_flags[i] for i in input (as determined by ufunc->nin) will be
diff --git a/numpy/core/src/umath/ufunc_type_resolution.c b/numpy/core/src/umath/ufunc_type_resolution.c
index efd923972..ec60d9cfd 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.c
+++ b/numpy/core/src/umath/ufunc_type_resolution.c
@@ -1304,89 +1304,6 @@ PyUFunc_MixedDivisionTypeResolver(PyUFuncObject *ufunc,
                                         type_tup, out_dtypes);
 }
 
-/*
- * XXX This is too restrictive, we should only check the inner axis involved
- * gh-12365
- */
-#define NOT_CONTIGUOUS(a) (PyArray_NDIM(a) >=2 && !( \
-        PyArray_IS_C_CONTIGUOUS(a) || PyArray_IS_F_CONTIGUOUS(a)))
-
-/*
- * This function applies special type resolution rules for the case
- * where all the functions have the pattern XX->X, using
- * PyArray_ResultType instead of a linear search to get the best
- * loop, like PyUFunc_SimpleBinaryOperationTypeResolver, and adds
- * memory overlap and contiguity considerations to the operands,
- * possibly creating Writeback temporary data
- *
- * Returns 0 on success, -1 on error.
- */
-NPY_NO_EXPORT int
-PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
-                           NPY_CASTING casting,
-                           PyArrayObject **operands,
-                           PyObject *type_tup,
-                           PyArray_Descr **out_dtypes)
-{
-
-    if (PyUFunc_SimpleBinaryOperationTypeResolver(ufunc, casting, operands,
-            type_tup, out_dtypes) < 0) {
-        return -1;
-    }
-    if (PyArray_NDIM(operands[0]) >= 2 || PyArray_NDIM(operands[1]) >= 2) {
-#if defined(HAVE_CBLAS)
-        int typenum =  out_dtypes[2]->type_num;
-        if (  (NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
-                NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
-            /*
-             * We are going to use BLAS
-             * make sure 2d and more arrays are contiguous
-             */
-            if (_bad_strides(operands[0]) || NOT_CONTIGUOUS(operands[0])) {
-                    PyObject *op = PyArray_NewCopy(operands[0], NPY_ANYORDER);
-
-                    if (op == NULL) {
-                       return -1;
-                    }
-                    Py_DECREF(operands[0]);
-                    operands[0] = (PyArrayObject *)op;
-            }
-            if (_bad_strides(operands[1]) || NOT_CONTIGUOUS(operands[1])) {
-                    PyObject *op = PyArray_NewCopy(operands[1], NPY_ANYORDER);
-
-                    if (op == NULL) {
-                        return -1;
-                    }
-                    Py_DECREF(operands[1]);
-                    operands[1] = (PyArrayObject *)op;
-            }
-        }
-#endif
-        if (operands[2] != NULL) {
-            npy_intp last_stride = PyArray_STRIDE(operands[2],
-                                                  PyArray_NDIM(operands[2]) - 1);
-            if (last_stride != PyArray_ITEMSIZE(operands[2])) {
-                PyErr_SetString(PyExc_ValueError,
-                    "output array is not acceptable (must be C-contiguous)");
-                return -1;
-            }
-            /*
-             * Use all the flags in PyUFunc_GeneralizedFunction
-             * except NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE and
-             * NPY_ITER_WRITEONLY. Without NPY_ITER_OVERLAP_ASSUME_ELEMENTWISE
-             * nditer will properly check overlap between output and any input.
-             */
-            ufunc->op_flags[2] = NPY_ITER_WRITEONLY |
-                      NPY_ITER_UPDATEIFCOPY |
-                      NPY_ITER_ALIGNED |
-                      NPY_ITER_ALLOCATE |
-                      NPY_ITER_NO_BROADCAST;
-        }
-    }
-    return 0;
-}
-
-
 static int
 find_userloop(PyUFuncObject *ufunc,
                 PyArray_Descr **dtypes,
diff --git a/numpy/core/src/umath/ufunc_type_resolution.h b/numpy/core/src/umath/ufunc_type_resolution.h
index 5306a5983..2f37af753 100644
--- a/numpy/core/src/umath/ufunc_type_resolution.h
+++ b/numpy/core/src/umath/ufunc_type_resolution.h
@@ -145,11 +145,4 @@ PyUFunc_DefaultMaskedInnerLoopSelector(PyUFuncObject *ufunc,
                                       NpyAuxData **out_innerloopdata,
                                       int *out_needs_api);
 
-NPY_NO_EXPORT int
-PyUFunc_MatmulTypeResolver(PyUFuncObject *ufunc,
-                              NPY_CASTING casting,
-                              PyArrayObject **operands,
-                              PyObject *type_tup,
-                              PyArray_Descr **out_dtypes);
-
 #endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index 80c69d9b3..fbf8ac0f5 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -5716,7 +5716,7 @@ class MatmulCommon(object):
             res = self.matmul(v, v)
             assert_(type(res) is np.dtype(dt).type)
 
-    def test_0d_vector_values(self):
+    def test_scalar_output(self):
         vec1 = np.array([2])
         vec2 = np.array([3, 4]).reshape(1, -1)
         tgt = np.array([6, 8])
@@ -5876,33 +5876,29 @@ class TestMatmul(MatmulCommon):
     matmul = np.matmul
 
     def test_out_arg(self):
-        a = np.ones((2, 2), dtype=float)
-        b = np.ones((2, 2), dtype=float)
-        tgt = np.full((2,2), 2, dtype=float)
+        a = np.ones((5, 2), dtype=float)
+        b = np.array([[1, 3], [5, 7]], dtype=float)
+        tgt = np.dot(a, b)
 
         # test as positional argument
         msg = "out positional argument"
-        out = np.zeros((2, 2), dtype=float)
+        out = np.zeros((5, 2), dtype=float)
         self.matmul(a, b, out)
         assert_array_equal(out, tgt, err_msg=msg)
 
         # test as keyword argument
         msg = "out keyword argument"
-        out = np.zeros((2, 2), dtype=float)
+        out = np.zeros((5, 2), dtype=float)
         self.matmul(a, b, out=out)
         assert_array_equal(out, tgt, err_msg=msg)
 
         # test out with not allowed type cast (safe casting)
         msg = "Cannot cast ufunc matmul output"
-        out = np.zeros((2, 2), dtype=np.int32)
+        out = np.zeros((5, 2), dtype=np.int32)
         assert_raises_regex(TypeError, msg, self.matmul, a, b, out=out)
 
-        # cblas does not allow non-contiguous
-        # outputs and consistency with dot would require same type,
-        # dimensions, subtype, and c_contiguous.
-
         # test out with type upcast to complex
-        out = np.zeros((2, 2), dtype=np.complex128)
+        out = np.zeros((5, 2), dtype=np.complex128)
         c = self.matmul(a, b, out=out)
         assert_(c is out)
         with suppress_warnings() as sup:
@@ -5910,15 +5906,39 @@ class TestMatmul(MatmulCommon):
             c = c.astype(tgt.dtype)
         assert_array_equal(c, tgt)
 
+    def test_out_contiguous(self):
+        a = np.ones((5, 2), dtype=float)
+        b = np.array([[1, 3], [5, 7]], dtype=float)
+        v = np.array([1, 3], dtype=float)
+        tgt = np.dot(a, b)
+        tgt_mv = np.dot(a, v)
+
         # test out non-contiguous
-        out = np.ones((2, 2, 2), dtype=float)
-        assert_raises(ValueError, self.matmul, a, b, out=out[..., 0])
+        out = np.ones((5, 2, 2), dtype=float)
+        c = self.matmul(a, b, out=out[..., 0])
+        assert c.base is out
+        assert_array_equal(c, tgt)
+        c = self.matmul(a, v, out=out[:, 0, 0])
+        assert_array_equal(c, tgt_mv)
+        c = self.matmul(v, a.T, out=out[:, 0, 0])
+        assert_array_equal(c, tgt_mv)
+
+        # test out contiguous in only last dim
+        out = np.ones((10, 2), dtype=float)
+        c = self.matmul(a, b, out=out[::2, :])
+        assert_array_equal(c, tgt)
+
+        # test transposes of out, args
+        out = np.ones((5, 2), dtype=float)
+        c = self.matmul(b.T, a.T, out=out.T)
+        assert_array_equal(out, tgt)
 
     m1 = np.arange(15.).reshape(5, 3)
     m2 = np.arange(21.).reshape(3, 7)
     m3 = np.arange(30.).reshape(5, 6)[:, ::2]  # non-contiguous
     vc = np.arange(10.)
     vr = np.arange(6.)
+    m0 = np.zeros((3, 0))
     @pytest.mark.parametrize('args', (
             # matrix-matrix
             (m1, m2), (m2.T, m1.T), (m2.T.copy(), m1.T), (m2.T, m1.T.copy()),
@@ -5934,12 +5954,19 @@ class TestMatmul(MatmulCommon):
             # vector-matrix, matrix-vector, matrix non-contiguous
             (m3, vr[:3]), (vc[:5], m3), (m3.T, vc[:5]), (vr[:3], m3.T),
             # vector-matrix, matrix-vector, both non-contiguous
-            (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T)))
+            (m3, vr[::2]), (vc[::2], m3), (m3.T, vc[::2]), (vr[::2], m3.T),
+            # size == 0
+            (m0, m0.T), (m0.T, m0), (m1, m0), (m0.T, m1.T),
+        ))
     def test_dot_equivalent(self, args):
         r1 = np.matmul(*args)
         r2 = np.dot(*args)
         assert_equal(r1, r2)
 
+        r3 = np.matmul(args[0].copy(), args[1].copy())
+        assert_equal(r1, r3)
+        
+
 
 if sys.version_info[:2] >= (3, 5):
     class TestMatmulOperator(MatmulCommon):
author	mattip <matti.picus@gmail.com>	2018-11-25 10:46:21 -0600
committer	mattip <matti.picus@gmail.com>	2018-12-01 16:13:59 -0800
commit	72f2abf34a43cf3642acf6a6ff951148051404ae (patch)
tree	d6df709bea86487caf12595f81f6ca76e7e8a373 /numpy/core
parent	8c9450a7fd69d5b74b47ffec60b5c235361daeff (diff)
download	numpy-72f2abf34a43cf3642acf6a6ff951148051404ae.tar.gz