1 files changed, 250 insertions, 65 deletions
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index cc2852da7..7ffa36775 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -5,6 +5,7 @@ import warnings
 
 from cpython.pycapsule cimport PyCapsule_IsValid, PyCapsule_GetPointer
 from cpython cimport (Py_INCREF, PyFloat_AsDouble)
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
 
 cimport cython
 import numpy as np
@@ -25,8 +26,16 @@ from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE,
             CONS_GT_1, CONS_POSITIVE_NOT_NAN, CONS_POISSON,
             double_fill, cont, kahan_sum, cont_broadcast_3, float_fill, cont_f,
             check_array_constraint, check_constraint, disc, discrete_broadcast_iii,
+            validate_output_shape
         )
 
+cdef extern from "numpy/arrayobject.h":
+    int PyArray_ResolveWritebackIfCopy(np.ndarray)
+    object PyArray_FromArray(np.PyArrayObject *, np.PyArray_Descr *, int)
+
+    enum:
+        NPY_ARRAY_WRITEBACKIFCOPY
+
 np.import_array()
 
 cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors):
@@ -47,6 +56,77 @@ cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors):
     return sum
 
 
+cdef inline void _shuffle_raw_wrap(bitgen_t *bitgen, np.npy_intp n,
+                                   np.npy_intp first, np.npy_intp itemsize,
+                                   np.npy_intp stride,
+                                   char* data, char* buf) nogil:
+    # We trick gcc into providing a specialized implementation for
+    # the most common case, yielding a ~33% performance improvement.
+    # Note that apparently, only one branch can ever be specialized.
+    if itemsize == sizeof(np.npy_intp):
+        _shuffle_raw(bitgen, n, first, sizeof(np.npy_intp), stride, data, buf)
+    else:
+        _shuffle_raw(bitgen, n, first, itemsize, stride, data, buf)
+
+
+cdef inline void _shuffle_raw(bitgen_t *bitgen, np.npy_intp n,
+                              np.npy_intp first, np.npy_intp itemsize,
+                              np.npy_intp stride,
+                              char* data, char* buf) nogil:
+    """
+    Parameters
+    ----------
+    bitgen
+        Pointer to a bitgen_t instance.
+    n
+        Number of elements in data
+    first
+        First observation to shuffle.  Shuffles n-1,
+        n-2, ..., first, so that when first=1 the entire
+        array is shuffled
+    itemsize
+        Size in bytes of item
+    stride
+        Array stride
+    data
+        Location of data
+    buf
+        Location of buffer (itemsize)
+    """
+    cdef np.npy_intp i, j
+
+    for i in reversed(range(first, n)):
+        j = random_interval(bitgen, i)
+        string.memcpy(buf, data + j * stride, itemsize)
+        string.memcpy(data + j * stride, data + i * stride, itemsize)
+        string.memcpy(data + i * stride, buf, itemsize)
+
+
+cdef inline void _shuffle_int(bitgen_t *bitgen, np.npy_intp n,
+                              np.npy_intp first, int64_t* data) nogil:
+    """
+    Parameters
+    ----------
+    bitgen
+        Pointer to a bitgen_t instance.
+    n
+        Number of elements in data
+    first
+        First observation to shuffle.  Shuffles n-1,
+        n-2, ..., first, so that when first=1 the entire
+        array is shuffled
+    data
+        Location of data
+    """
+    cdef np.npy_intp i, j
+    cdef int64_t temp
+    for i in reversed(range(first, n)):
+        j = random_bounded_uint64(bitgen, 0, i, 0, 0)
+        temp = data[j]
+        data[j] = data[i]
+        data[i] = temp
+
+
 cdef bint _check_bit_generator(object bitgen):
     """Check if an object satisfies the BitGenerator interface.
     """
@@ -707,8 +787,8 @@ cdef class Generator:
                     idx = np.PyArray_Arange(0, pop_size_i, 1, np.NPY_INT64)
                     idx_data = <int64_t*>(<np.ndarray>idx).data
                     with self.lock, nogil:
-                        self._shuffle_int(pop_size_i, max(pop_size_i - size_i, 1),
-                                          idx_data)
+                        _shuffle_int(&self._bitgen, pop_size_i,
+                                     max(pop_size_i - size_i, 1), idx_data)
                     # Copy to allow potentially large array backing idx to be gc
                     idx = idx[(pop_size - size):].copy()
                 else:
@@ -736,7 +816,7 @@ cdef class Generator:
                                 hash_set[loc] = j
                                 idx_data[j - pop_size_i + size_i] = j
                         if shuffle:
-                            self._shuffle_int(size_i, 1, idx_data)
+                            _shuffle_int(&self._bitgen, size_i, 1, idx_data)
                 idx.shape = shape
 
         if is_scalar and isinstance(idx, np.ndarray):
@@ -2809,6 +2889,7 @@ cdef class Generator:
             cnt = np.PyArray_SIZE(randoms)
 
             it = np.PyArray_MultiIterNew3(randoms, p_arr, n_arr)
+            validate_output_shape(it.shape, randoms)
             with self.lock, nogil:
                 for i in range(cnt):
                     _dp = (<double*>np.PyArray_MultiIter_DATA(it, 1))[0]
@@ -3606,7 +3687,7 @@ cdef class Generator:
         Now, do one experiment throwing the dice 10 time, and 10 times again,
         and another throwing the dice 20 times, and 20 times again:
 
-        >>> rng.multinomial([[10], [20]], [1/6.]*6, size=2)
+        >>> rng.multinomial([[10], [20]], [1/6.]*6, size=(2, 2))
         array([[[2, 4, 0, 1, 2, 1],
                 [1, 3, 0, 3, 1, 2]],
                [[1, 4, 4, 4, 4, 3],
@@ -3661,6 +3742,7 @@ cdef class Generator:
                 temp = np.empty(size, dtype=np.int8)
                 temp_arr = <np.ndarray>temp
                 it = np.PyArray_MultiIterNew2(on, temp_arr)
+                validate_output_shape(it.shape, temp_arr)
             shape = it.shape + (d,)
             multin = np.zeros(shape, dtype=np.int64)
             mnarr = <np.ndarray>multin
@@ -3941,7 +4023,7 @@ cdef class Generator:
             The drawn samples, of shape ``(size, k)``.
 
         Raises
-        -------
+        ------
         ValueError
             If any value in ``alpha`` is less than or equal to zero
 
@@ -4111,7 +4193,159 @@ cdef class Generator:
 
         return diric
 
-    # Shuffling and permutations:
+    def permuted(self, object x, *, axis=None, out=None):
+        """
+        permuted(x, axis=None, out=None)
+
+        Randomly permute `x` along axis `axis`.
+
+        Unlike `shuffle`, each slice along the given axis is shuffled
+        independently of the others.
+
+        Parameters
+        ----------
+        x : array_like, at least one-dimensional
+            Array to be shuffled.
+        axis : int, optional
+            Slices of `x` in this axis are shuffled. Each slice
+            is shuffled independently of the others.  If `axis` is
+            None, the flattened array is shuffled.
+        out : ndarray, optional
+            If given, this is the destinaton of the shuffled array.
+            If `out` is None, a shuffled copy of the array is returned.
+
+        Returns
+        -------
+        ndarray
+            If `out` is None, a shuffled copy of `x` is returned.
+            Otherwise, the shuffled array is stored in `out`,
+            and `out` is returned
+
+        See Also
+        --------
+        shuffle
+        permutation
+
+        Examples
+        --------
+        Create a `numpy.random.Generator` instance:
+
+        >>> rng = np.random.default_rng()
+
+        Create a test array:
+
+        >>> x = np.arange(24).reshape(3, 8)
+        >>> x
+        array([[ 0,  1,  2,  3,  4,  5,  6,  7],
+               [ 8,  9, 10, 11, 12, 13, 14, 15],
+               [16, 17, 18, 19, 20, 21, 22, 23]])
+
+        Shuffle the rows of `x`:
+
+        >>> y = rng.permuted(x, axis=1)
+        >>> y
+        array([[ 4,  3,  6,  7,  1,  2,  5,  0],  # random
+               [15, 10, 14,  9, 12, 11,  8, 13],
+               [17, 16, 20, 21, 18, 22, 23, 19]])
+
+        `x` has not been modified:
+
+        >>> x
+        array([[ 0,  1,  2,  3,  4,  5,  6,  7],
+               [ 8,  9, 10, 11, 12, 13, 14, 15],
+               [16, 17, 18, 19, 20, 21, 22, 23]])
+
+        To shuffle the rows of `x` in-place, pass `x` as the `out`
+        parameter:
+
+        >>> y = rng.permuted(x, axis=1, out=x)
+        >>> x
+        array([[ 3,  0,  4,  7,  1,  6,  2,  5],  # random
+               [ 8, 14, 13,  9, 12, 11, 15, 10],
+               [17, 18, 16, 22, 19, 23, 20, 21]])
+
+        Note that when the ``out`` parameter is given, the return
+        value is ``out``:
+
+        >>> y is x
+        True
+        """
+
+        cdef int ax
+        cdef np.npy_intp axlen, axstride, itemsize
+        cdef void *buf
+        cdef np.flatiter it
+        cdef np.ndarray to_shuffle
+        cdef int status
+        cdef int flags
+
+        x = np.asarray(x)
+
+        if out is None:
+            out = x.copy(order='K')
+        else:
+            if type(out) is not np.ndarray:
+                raise TypeError('out must be a numpy array')
+            if out.shape != x.shape:
+                raise ValueError('out must have the same shape as x')
+            np.copyto(out, x, casting='safe')
+
+        if axis is None:
+            if x.ndim > 1:
+                if not (np.PyArray_FLAGS(out) & (np.NPY_ARRAY_C_CONTIGUOUS |
+                                                 np.NPY_ARRAY_F_CONTIGUOUS)): 
+                    flags = (np.NPY_ARRAY_C_CONTIGUOUS |
+                             NPY_ARRAY_WRITEBACKIFCOPY)
+                    to_shuffle = PyArray_FromArray(<np.PyArrayObject *>out,
+                                                   <np.PyArray_Descr *>NULL, flags)
+                    self.shuffle(to_shuffle.ravel(order='K'))
+                    # Because we only execute this block if out is not
+                    # contiguous, we know this call will always result in a
+                    # copy of to_shuffle back to out. I.e. status will be 1.
+                    status = PyArray_ResolveWritebackIfCopy(to_shuffle)
+                    assert status == 1
+                else:
+                    # out is n-d with n > 1, but is either C- or F-contiguous,
+                    # so we know out.ravel(order='A') is a view.
+                    self.shuffle(out.ravel(order='A'))
+            else:
+                # out is 1-d
+                self.shuffle(out)
+            return out
+
+        ax = normalize_axis_index(axis, np.ndim(out))
+        itemsize = out.itemsize
+        axlen = out.shape[ax]
+        axstride = out.strides[ax]
+
+        it = np.PyArray_IterAllButAxis(out, &ax)
+
+        buf = PyMem_Malloc(itemsize)
+        if buf == NULL:
+            raise MemoryError('memory allocation failed in permuted')
+
+        if out.dtype.hasobject:
+            # Keep the GIL when shuffling an object array.
+            with self.lock:
+                while np.PyArray_ITER_NOTDONE(it):
+                    _shuffle_raw_wrap(&self._bitgen, axlen, 0, itemsize,
+                                      axstride,
+                                      <char *>np.PyArray_ITER_DATA(it),
+                                      <char *>buf)
+                    np.PyArray_ITER_NEXT(it)
+        else:
+            # out is not an object array, so we can release the GIL.
+            with self.lock, nogil:
+                while np.PyArray_ITER_NOTDONE(it):
+                    _shuffle_raw_wrap(&self._bitgen, axlen, 0, itemsize,
+                                      axstride,
+                                      <char *>np.PyArray_ITER_DATA(it),
+                                      <char *>buf)
+                    np.PyArray_ITER_NEXT(it)
+
+        PyMem_Free(buf)
+        return out
+
     def shuffle(self, object x, axis=0):
         """
         shuffle(x, axis=0)
@@ -4174,14 +4408,15 @@ cdef class Generator:
             # when the function exits.
             buf = np.empty(itemsize, dtype=np.int8)  # GC'd at function exit
             buf_ptr = <char*><size_t>np.PyArray_DATA(buf)
-            with self.lock:
-                # We trick gcc into providing a specialized implementation for
-                # the most common case, yielding a ~33% performance improvement.
-                # Note that apparently, only one branch can ever be specialized.
-                if itemsize == sizeof(np.npy_intp):
-                    self._shuffle_raw(n, 1, sizeof(np.npy_intp), stride, x_ptr, buf_ptr)
-                else:
-                    self._shuffle_raw(n, 1, itemsize, stride, x_ptr, buf_ptr)
+            if x.dtype.hasobject:
+                with self.lock:
+                    _shuffle_raw_wrap(&self._bitgen, n, 1, itemsize, stride,
+                                      x_ptr, buf_ptr)
+            else:
+                # Same as above, but the GIL is released.
+                with self.lock, nogil:
+                    _shuffle_raw_wrap(&self._bitgen, n, 1, itemsize, stride,
+                                      x_ptr, buf_ptr)
         elif isinstance(x, np.ndarray) and x.ndim and x.size:
             x = np.swapaxes(x, 0, axis)
             buf = np.empty_like(x[0, ...])
@@ -4204,56 +4439,6 @@ cdef class Generator:
                     j = random_interval(&self._bitgen, i)
                     x[i], x[j] = x[j], x[i]
 
-    cdef inline _shuffle_raw(self, np.npy_intp n, np.npy_intp first,
-                             np.npy_intp itemsize, np.npy_intp stride,
-                             char* data, char* buf):
-        """
-        Parameters
-        ----------
-        n
-            Number of elements in data
-        first
-            First observation to shuffle.  Shuffles n-1,
-            n-2, ..., first, so that when first=1 the entire
-            array is shuffled
-        itemsize
-            Size in bytes of item
-        stride
-            Array stride
-        data
-            Location of data
-        buf
-            Location of buffer (itemsize)
-        """
-        cdef np.npy_intp i, j
-        for i in reversed(range(first, n)):
-            j = random_interval(&self._bitgen, i)
-            string.memcpy(buf, data + j * stride, itemsize)
-            string.memcpy(data + j * stride, data + i * stride, itemsize)
-            string.memcpy(data + i * stride, buf, itemsize)
-
-    cdef inline void _shuffle_int(self, np.npy_intp n, np.npy_intp first,
-                             int64_t* data) nogil:
-        """
-        Parameters
-        ----------
-        n
-            Number of elements in data
-        first
-            First observation to shuffle.  Shuffles n-1,
-            n-2, ..., first, so that when first=1 the entire
-            array is shuffled
-        data
-            Location of data
-        """
-        cdef np.npy_intp i, j
-        cdef int64_t temp
-        for i in reversed(range(first, n)):
-            j = random_bounded_uint64(&self._bitgen, 0, i, 0, 0)
-            temp = data[j]
-            data[j] = data[i]
-            data[i] = temp
-
     def permutation(self, object x, axis=0):
         """
         permutation(x, axis=0)
@@ -4336,7 +4521,7 @@ def default_rng(seed=None):
         unpredictable entropy will be pulled from the OS. If an ``int`` or
         ``array_like[ints]`` is passed, then it will be passed to
         `SeedSequence` to derive the initial `BitGenerator` state. One may also
-        pass in a`SeedSequence` instance
+        pass in a `SeedSequence` instance.
         Additionally, when passed a `BitGenerator`, it will be wrapped by
         `Generator`. If passed a `Generator`, it will be returned unaltered.