Diffstat (limited to 'numpy')
-rw-r--r-- | numpy/core/code_generators/ufunc_docstrings.py | 11
-rw-r--r-- | numpy/core/memmap.py | 8
-rw-r--r-- | numpy/linalg/linalg.py | 36
-rw-r--r-- | numpy/linalg/tests/test_linalg.py | 35
-rw-r--r-- | numpy/ma/core.py | 6
-rw-r--r-- | numpy/random/_generator.pyx | 85
-rw-r--r-- | numpy/random/_pcg64.pyx | 2
-rw-r--r-- | numpy/random/tests/test_generator_mt19937.py | 25
8 files changed, 178 insertions, 30 deletions
diff --git a/numpy/core/code_generators/ufunc_docstrings.py b/numpy/core/code_generators/ufunc_docstrings.py
index 129516658..82cd6fb27 100644
--- a/numpy/core/code_generators/ufunc_docstrings.py
+++ b/numpy/core/code_generators/ufunc_docstrings.py
@@ -1835,6 +1835,17 @@ add_newdoc('numpy.core.umath', 'left_shift',
     >>> np.left_shift(5, [1,2,3])
     array([10, 20, 40])
 
+    Note that the dtype of the second argument may change the dtype of the
+    result and can lead to unexpected results in some cases (see
+    :ref:`Casting Rules <ufuncs.casting>`):
+
+    >>> a = np.left_shift(np.uint8(255), 1) # Expect 254
+    >>> print(a, type(a)) # Unexpected result due to upcasting
+    510 <class 'numpy.int64'>
+    >>> b = np.left_shift(np.uint8(255), np.uint8(1))
+    >>> print(b, type(b))
+    254 <class 'numpy.uint8'>
+
     """)
 
 add_newdoc('numpy.core.umath', 'less',
diff --git a/numpy/core/memmap.py b/numpy/core/memmap.py
index ad66446c2..cb025736e 100644
--- a/numpy/core/memmap.py
+++ b/numpy/core/memmap.py
@@ -209,10 +209,12 @@ class memmap(ndarray):
         import os.path
         try:
             mode = mode_equivalents[mode]
-        except KeyError:
+        except KeyError as e:
             if mode not in valid_filemodes:
-                raise ValueError("mode must be one of %s" %
-                                 (valid_filemodes + list(mode_equivalents.keys())))
+                raise ValueError(
+                    "mode must be one of {!r} (got {!r})"
+                    .format(valid_filemodes + list(mode_equivalents.keys()), mode)
+                ) from None
 
         if mode == 'w+' and shape is None:
             raise ValueError("shape must be given")
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index 6d3afdd49..5ee326f3c 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -2613,12 +2613,13 @@ def norm(x, ord=None, axis=None, keepdims=False):
 
 # multi_dot
 
-def _multidot_dispatcher(arrays):
-    return arrays
+def _multidot_dispatcher(arrays, *, out=None):
+    yield from arrays
+    yield out
 
 
 @array_function_dispatch(_multidot_dispatcher)
-def multi_dot(arrays):
+def multi_dot(arrays, *, out=None):
     """
     Compute the dot product of two or more arrays in a single function call,
     while automatically selecting the fastest evaluation order.
@@ -2642,6 +2643,15 @@
         If the first argument is 1-D it is treated as row vector.
         If the last argument is 1-D it is treated as column vector.
         The other arguments must be 2-D.
+    out : ndarray, optional
+        Output argument. This must have the exact kind that would be returned
+        if it was not used. In particular, it must have the right type, must be
+        C-contiguous, and its dtype must be the dtype that would be returned
+        for `dot(a, b)`. This is a performance feature. Therefore, if these
+        conditions are not met, an exception is raised, instead of attempting
+        to be flexible.
+
+        .. versionadded:: 1.19.0
 
     Returns
     -------
@@ -2699,7 +2709,7 @@
     if n < 2:
         raise ValueError("Expecting at least two arrays.")
     elif n == 2:
-        return dot(arrays[0], arrays[1])
+        return dot(arrays[0], arrays[1], out=out)
 
     arrays = [asanyarray(a) for a in arrays]
 
@@ -2715,10 +2725,10 @@
 
     # _multi_dot_three is much faster than _multi_dot_matrix_chain_order
    if n == 3:
-        result = _multi_dot_three(arrays[0], arrays[1], arrays[2])
+        result = _multi_dot_three(arrays[0], arrays[1], arrays[2], out=out)
     else:
         order = _multi_dot_matrix_chain_order(arrays)
-        result = _multi_dot(arrays, order, 0, n - 1)
+        result = _multi_dot(arrays, order, 0, n - 1, out=out)
 
     # return proper shape
     if ndim_first == 1 and ndim_last == 1:
@@ -2729,7 +2739,7 @@
     return result
 
 
-def _multi_dot_three(A, B, C):
+def _multi_dot_three(A, B, C, out=None):
     """
     Find the best order for three arrays and do the multiplication.
 
@@ -2745,9 +2755,9 @@
     cost2 = a1b0 * c1 * (a0 + b1c0)
 
     if cost1 < cost2:
-        return dot(dot(A, B), C)
+        return dot(dot(A, B), C, out=out)
     else:
-        return dot(A, dot(B, C))
+        return dot(A, dot(B, C), out=out)
 
 
 def _multi_dot_matrix_chain_order(arrays, return_costs=False):
@@ -2791,10 +2801,14 @@
     return (s, m) if return_costs else s
 
 
-def _multi_dot(arrays, order, i, j):
+def _multi_dot(arrays, order, i, j, out=None):
     """Actually do the multiplication with the given order."""
     if i == j:
+        # the initial call with non-None out should never get here
+        assert out is None
+
         return arrays[i]
     else:
         return dot(_multi_dot(arrays, order, i, order[i, j]),
-                   _multi_dot(arrays, order, order[i, j] + 1, j))
+                   _multi_dot(arrays, order, order[i, j] + 1, j),
+                   out=out)
diff --git a/numpy/linalg/tests/test_linalg.py b/numpy/linalg/tests/test_linalg.py
index dae4ef61e..3f3bf9f70 100644
--- a/numpy/linalg/tests/test_linalg.py
+++ b/numpy/linalg/tests/test_linalg.py
@@ -1930,6 +1930,41 @@ class TestMultiDot:
         # the result should be a scalar
         assert_equal(multi_dot([A1d, B, C, D1d]).shape, ())
 
+    def test_three_arguments_and_out(self):
+        # multi_dot with three arguments uses a fast hand coded algorithm to
+        # determine the optimal order. Therefore test it separately.
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+
+        out = np.zeros((6, 2))
+        ret = multi_dot([A, B, C], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B).dot(C))
+        assert_almost_equal(out, np.dot(A, np.dot(B, C)))
+
+    def test_two_arguments_and_out(self):
+        # separate code path with two arguments
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        out = np.zeros((6, 6))
+        ret = multi_dot([A, B], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B))
+        assert_almost_equal(out, np.dot(A, B))
+
+    def test_dynamic_programming_optimization_and_out(self):
+        # multi_dot with four or more arguments uses the dynamic programming
+        # optimization and therefore deserves a separate test
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+        D = np.random.random((2, 1))
+        out = np.zeros((6, 1))
+        ret = multi_dot([A, B, C, D], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B).dot(C).dot(D))
+
     def test_dynamic_programming_logic(self):
         # Test for the dynamic programming part
         # This test is directly taken from Cormen page 376.
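The tests above also serve as usage documentation for the new keyword. As a minimal standalone sketch (the shapes are arbitrary, chosen only for illustration), the `out` buffer must already have the final shape and dtype of the chained product:

    import numpy as np
    from numpy.linalg import multi_dot

    A = np.random.random((10, 100))
    B = np.random.random((100, 5))
    C = np.random.random((5, 50))

    # Preallocate a C-contiguous float64 buffer with the shape of A @ B @ C;
    # multi_dot writes the result into it and returns the same array.
    out = np.empty((10, 50))
    ret = multi_dot([A, B, C], out=out)
    assert ret is out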
diff --git a/numpy/ma/core.py b/numpy/ma/core.py
index a5e59bb74..a7214f9bf 100644
--- a/numpy/ma/core.py
+++ b/numpy/ma/core.py
@@ -285,8 +285,10 @@ def _extremum_fill_value(obj, extremum, extremum_name):
     def _scalar_fill_value(dtype):
         try:
             return extremum[dtype]
-        except KeyError:
-            raise TypeError(f"Unsuitable type {dtype} for calculating {extremum_name}.")
+        except KeyError as e:
+            raise TypeError(
+                f"Unsuitable type {dtype} for calculating {extremum_name}."
+            ) from None
 
     dtype = _get_dtype_of(obj)
     return _recursive_fill_value(dtype, _scalar_fill_value)
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index b976d51c6..27cb2859e 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -4007,10 +4007,12 @@ cdef class Generator:
         #     return val
 
         cdef np.npy_intp k, totsize, i, j
-        cdef np.ndarray alpha_arr, val_arr
+        cdef np.ndarray alpha_arr, val_arr, alpha_csum_arr
+        cdef double csum
         cdef double *alpha_data
+        cdef double *alpha_csum_data
         cdef double *val_data
-        cdef double acc, invacc
+        cdef double acc, invacc, v
 
         k = len(alpha)
         alpha_arr = <np.ndarray>np.PyArray_FROMANY(
@@ -4034,17 +4036,74 @@
 
         i = 0
         totsize = np.PyArray_SIZE(val_arr)
-        with self.lock, nogil:
-            while i < totsize:
-                acc = 0.0
-                for j in range(k):
-                    val_data[i+j] = random_standard_gamma(&self._bitgen,
-                                                          alpha_data[j])
-                    acc = acc + val_data[i + j]
-                invacc = 1/acc
-                for j in range(k):
-                    val_data[i + j] = val_data[i + j] * invacc
-                i = i + k
+
+        # Select one of the following two algorithms for the generation
+        #  of Dirichlet random variates (RVs)
+        #
+        # A) Small alpha case: Use the stick-breaking approach with beta
+        #    random variates (RVs).
+        # B) Standard case: Perform unit normalisation of a vector
+        #    of gamma random variates
+        #
+        # A) prevents NaNs resulting from 0/0 that may occur in B)
+        #    when all values in the vector ':math:\\alpha' are smaller
+        #    than 1, then there is a nonzero probability that all
+        #    generated gamma RVs will be 0. When that happens, the
+        #    normalization process ends up computing 0/0, giving nan. A)
+        #    does not use divisions, so that a situation in which 0/0 has
+        #    to be computed cannot occur. A) is slower than B) as
+        #    generation of beta RVs is slower than generation of gamma
+        #    RVs. A) is selected whenever `alpha.max() < t`, where `t <
+        #    1` is a threshold that controls the probability of
+        #    generating a NaN value when B) is used. For a given
+        #    threshold `t` this probability can be bounded by
+        #    `gammainc(t, d)` where `gammainc` is the regularized
+        #    incomplete gamma function and `d` is the smallest positive
+        #    floating point number that can be represented with a given
+        #    precision. For the chosen threshold `t=0.1` this probability
+        #    is smaller than `1.8e-31` for double precision floating
+        #    point numbers.
+
+        if (k > 0) and (alpha_arr.max() < 0.1):
+            # Small alpha case: Use stick-breaking approach with beta
+            # random variates (RVs).
+            # alpha_csum_data will hold the cumulative sum, right to
+            # left, of alpha_arr.
+            # Use a numpy array for memory management only. We could just as
+            # well have malloc'd alpha_csum_data. alpha_arr is a C-contiguous
+            # double array, therefore so is alpha_csum_arr.
+            alpha_csum_arr = np.empty_like(alpha_arr)
+            alpha_csum_data = <double*>np.PyArray_DATA(alpha_csum_arr)
+            csum = 0.0
+            for j in range(k - 1, -1, -1):
+                csum += alpha_data[j]
+                alpha_csum_data[j] = csum
+
+            with self.lock, nogil:
+                while i < totsize:
+                    acc = 1.
+                    for j in range(k - 1):
+                        v = random_beta(&self._bitgen, alpha_data[j],
+                                        alpha_csum_data[j + 1])
+                        val_data[i + j] = acc * v
+                        acc *= (1. - v)
+                    val_data[i + k - 1] = acc
+                    i = i + k
+
+        else:
+            # Standard case: Unit normalisation of a vector of gamma random
+            # variates
+            with self.lock, nogil:
+                while i < totsize:
+                    acc = 0.
+                    for j in range(k):
+                        val_data[i + j] = random_standard_gamma(&self._bitgen,
+                                                                alpha_data[j])
+                        acc = acc + val_data[i + j]
+                    invacc = 1. / acc
+                    for j in range(k):
+                        val_data[i + j] = val_data[i + j] * invacc
+                    i = i + k
 
         return diric
diff --git a/numpy/random/_pcg64.pyx b/numpy/random/_pcg64.pyx
index 05d27db5c..605aae4bc 100644
--- a/numpy/random/_pcg64.pyx
+++ b/numpy/random/_pcg64.pyx
@@ -38,7 +38,7 @@ cdef double pcg64_double(void* st) nogil:
 
 cdef class PCG64(BitGenerator):
     """
-    PCG64(seed_seq=None)
+    PCG64(seed=None)
 
     BitGenerator for the PCG-64 pseudo-random number generator.
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index 08b44e4db..a28b7ca11 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -1103,6 +1103,31 @@ class TestRandomDist:
                                       size=(3, 2))
         assert_array_almost_equal(non_contig, contig)
 
+    def test_dirichlet_small_alpha(self):
+        eps = 1.0e-9  # 1.0e-10 -> runtime x 10; 1e-11 -> runtime x 200, etc.
+        alpha = eps * np.array([1., 1.0e-3])
+        random = Generator(MT19937(self.seed))
+        actual = random.dirichlet(alpha, size=(3, 2))
+        expected = np.array([
+            [[1., 0.],
+             [1., 0.]],
+            [[1., 0.],
+             [1., 0.]],
+            [[1., 0.],
+             [1., 0.]]
+        ])
+        assert_array_almost_equal(actual, expected, decimal=15)
+
+    @pytest.mark.slow
+    def test_dirichlet_moderately_small_alpha(self):
+        # Use alpha.max() < 0.1 to trigger stick breaking code path
+        alpha = np.array([0.02, 0.04, 0.03])
+        exact_mean = alpha / alpha.sum()
+        random = Generator(MT19937(self.seed))
+        sample = random.dirichlet(alpha, size=20000000)
+        sample_mean = sample.mean(axis=0)
+        assert_allclose(sample_mean, exact_mean, rtol=1e-3)
+
     def test_exponential(self):
         random = Generator(MT19937(self.seed))
         actual = random.exponential(1.1234, size=(3, 2))
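For readers following the small-alpha branch added to `_generator.pyx` above, the stick-breaking construction can be mirrored at the NumPy level. The sketch below is only an illustration of the same idea (the function name `dirichlet_stick_breaking` is hypothetical and `Generator.beta` stands in for the C-level `random_beta` loop); it is not the shipped implementation:

    import numpy as np

    def dirichlet_stick_breaking(rng, alpha):
        # Right-to-left cumulative sum of alpha, mirroring alpha_csum_data.
        alpha = np.asarray(alpha, dtype=np.float64)
        csum = np.cumsum(alpha[::-1])[::-1]
        out = np.empty_like(alpha)
        acc = 1.0  # remaining length of the unit "stick"
        for j in range(alpha.size - 1):
            v = rng.beta(alpha[j], csum[j + 1])
            out[j] = acc * v      # break off a piece of the stick
            acc *= 1.0 - v
        out[-1] = acc             # the last component gets the remainder
        return out

    rng = np.random.default_rng(12345)
    print(dirichlet_stick_breaking(rng, [0.02, 0.04, 0.03]))

Because no division is performed, the 0/0 situation described in the code comment above cannot arise, even when every gamma draw in the standard path would have underflowed to zero.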