17 files changed, 349 insertions, 115 deletions
diff --git a/numpy/core/code_generators/cversions.txt b/numpy/core/code_generators/cversions.txt
index 38ee4dac2..f0a128d3d 100644
--- a/numpy/core/code_generators/cversions.txt
+++ b/numpy/core/code_generators/cversions.txt
@@ -59,4 +59,4 @@
 0x0000000e = 17a0f366e55ec05e5c5c149123478452
 
 # Version 15 (NumPy 1.22) Configurable memory allocations
-0x0000000f = 0c420aed67010594eb81f23ddfb02a88
+0x0000000f = b8783365b873681cd204be50cdfb448d
diff --git a/numpy/core/code_generators/numpy_api.py b/numpy/core/code_generators/numpy_api.py
index 3813c6ad7..d12d62d8f 100644
--- a/numpy/core/code_generators/numpy_api.py
+++ b/numpy/core/code_generators/numpy_api.py
@@ -19,6 +19,7 @@ from code_generators.genapi import StealRef, NonNull
 multiarray_global_vars = {
     'NPY_NUMUSERTYPES':             (7, 'int'),
     'NPY_DEFAULT_ASSIGN_CASTING':   (292, 'NPY_CASTING'),
+    'PyDataMem_DefaultHandler':     (306, 'PyObject*'),
 }
 
 multiarray_scalar_bool_values = {
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 57cc592b9..88794ca07 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -356,14 +356,32 @@ typedef unsigned long npy_ulonglong;
 typedef unsigned char npy_bool;
 #define NPY_FALSE 0
 #define NPY_TRUE 1
-
-
+/*
+ * `NPY_SIZEOF_LONGDOUBLE` isn't usually equal to sizeof(long double).
+ * In some certain cases, it may forced to be equal to sizeof(double)
+ * even against the compiler implementation and the same goes for
+ * `complex long double`.
+ *
+ * Therefore, avoid `long double`, use `npy_longdouble` instead,
+ * and when it comes to standard math functions make sure of using
+ * the double version when `NPY_SIZEOF_LONGDOUBLE` == `NPY_SIZEOF_DOUBLE`.
+ * For example:
+ *   npy_longdouble *ptr, x;
+ *   #if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+ *       npy_longdouble r = modf(x, ptr);
+ *   #else
+ *       npy_longdouble r = modfl(x, ptr);
+ *   #endif
+ *
+ * See https://github.com/numpy/numpy/issues/20348
+ */
 #if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
-        #define NPY_LONGDOUBLE_FMT "g"
+    #define NPY_LONGDOUBLE_FMT "g"
+    typedef double npy_longdouble;
 #else
-        #define NPY_LONGDOUBLE_FMT "Lg"
+    #define NPY_LONGDOUBLE_FMT "Lg"
+    typedef long double npy_longdouble;
 #endif
-typedef long double npy_longdouble;
 
 #ifndef Py_USING_UNICODE
 #error Must use Python with unicode enabled.
diff --git a/numpy/core/src/multiarray/_multiarray_tests.c.src b/numpy/core/src/multiarray/_multiarray_tests.c.src
index e945d0771..d7dfa4829 100644
--- a/numpy/core/src/multiarray/_multiarray_tests.c.src
+++ b/numpy/core/src/multiarray/_multiarray_tests.c.src
@@ -2193,7 +2193,7 @@ PrintFloat_Printf_g(PyObject *obj, int precision)
     }
     else if (PyArray_IsScalar(obj, LongDouble)) {
         npy_longdouble x = PyArrayScalar_VAL(obj, LongDouble);
-        PyOS_snprintf(str, sizeof(str), "%.*Lg", precision, x);
+        PyOS_snprintf(str, sizeof(str), "%.*" NPY_LONGDOUBLE_FMT, precision, x);
     }
     else{
         double val = PyFloat_AsDouble(obj);
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
index d1173410d..0a694cf62 100644
--- a/numpy/core/src/multiarray/alloc.c
+++ b/numpy/core/src/multiarray/alloc.c
@@ -379,6 +379,8 @@ PyDataMem_Handler default_handler = {
         default_free     /* free */
     }
 };
+/* singleton capsule of the default handler */
+PyObject *PyDataMem_DefaultHandler;
 
 #if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
 PyObject *current_handler;
@@ -519,16 +521,9 @@ PyDataMem_SetHandler(PyObject *handler)
         return NULL;
     }
     if (handler == NULL) {
-        handler = PyCapsule_New(&default_handler, "mem_handler", NULL);
-        if (handler == NULL) {
-            return NULL;
-        }
-    }
-    else {
-        Py_INCREF(handler);
+        handler = PyDataMem_DefaultHandler;
     }
     token = PyContextVar_Set(current_handler, handler);
-    Py_DECREF(handler);
     if (token == NULL) {
         Py_DECREF(old_handler);
         return NULL;
@@ -543,26 +538,13 @@ PyDataMem_SetHandler(PyObject *handler)
     }
     old_handler = PyDict_GetItemString(p, "current_allocator");
     if (old_handler == NULL) {
-        old_handler = PyCapsule_New(&default_handler, "mem_handler", NULL);
-        if (old_handler == NULL) {
-            return NULL;
-        }
-    }
-    else {
-        Py_INCREF(old_handler);
+        old_handler = PyDataMem_DefaultHandler
     }
+    Py_INCREF(old_handler);
     if (handler == NULL) {
-        handler = PyCapsule_New(&default_handler, "mem_handler", NULL);
-        if (handler == NULL) {
-            Py_DECREF(old_handler);
-            return NULL;
-        }
-    }
-    else {
-        Py_INCREF(handler);
+        handler = PyDataMem_DefaultHandler;
     }
     const int error = PyDict_SetItemString(p, "current_allocator", handler);
-    Py_DECREF(handler);
     if (error) {
         Py_DECREF(old_handler);
         return NULL;
diff --git a/numpy/core/src/multiarray/alloc.h b/numpy/core/src/multiarray/alloc.h
index f1ccf0bcd..13c828458 100644
--- a/numpy/core/src/multiarray/alloc.h
+++ b/numpy/core/src/multiarray/alloc.h
@@ -40,9 +40,9 @@ npy_free_cache_dim_array(PyArrayObject * arr)
     npy_free_cache_dim(PyArray_DIMS(arr), PyArray_NDIM(arr));
 }
 
+extern PyDataMem_Handler default_handler;
 #if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
 extern PyObject *current_handler; /* PyContextVar/PyCapsule */
-extern PyDataMem_Handler default_handler;
 #endif
 
 NPY_NO_EXPORT PyObject *
diff --git a/numpy/core/src/multiarray/descriptor.c b/numpy/core/src/multiarray/descriptor.c
index fd2577bc6..0c539053c 100644
--- a/numpy/core/src/multiarray/descriptor.c
+++ b/numpy/core/src/multiarray/descriptor.c
@@ -1326,7 +1326,7 @@ _convert_from_dict(PyObject *obj, int align)
             goto fail;
         }
         /* If align is set, make sure the alignment divides into the size */
-        if (align && itemsize % new->alignment != 0) {
+        if (align && new->alignment > 0 && itemsize % new->alignment != 0) {
             PyErr_Format(PyExc_ValueError,
                     "NumPy dtype descriptor requires alignment of %d bytes, "
                     "which is not divisible into the specified itemsize %d",
diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
index a854bcb3b..1520ff7ce 100644
--- a/numpy/core/src/multiarray/multiarraymodule.c
+++ b/numpy/core/src/multiarray/multiarraymodule.c
@@ -4919,16 +4919,19 @@ PyMODINIT_FUNC PyInit__multiarray_umath(void) {
     if (initumath(m) != 0) {
         goto err;
     }
-#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
     /*
-     * Initialize the context-local PyDataMem_Handler capsule.
+     * Initialize the default PyDataMem_Handler capsule singleton.
      */
-    c_api = PyCapsule_New(&default_handler, "mem_handler", NULL);
-    if (c_api == NULL) {
+    PyDataMem_DefaultHandler = PyCapsule_New(&default_handler, "mem_handler", NULL);
+    if (PyDataMem_DefaultHandler == NULL) {
         goto err;
     }
-    current_handler = PyContextVar_New("current_allocator", c_api);
-    Py_DECREF(c_api);
+#if (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x07030600)
+    /*
+     * Initialize the context-local current handler
+     * with the default PyDataMem_Handler capsule.
+    */
+    current_handler = PyContextVar_New("current_allocator", PyDataMem_DefaultHandler);
     if (current_handler == NULL) {
         goto err;
     }
diff --git a/numpy/core/src/npymath/npy_math_internal.h.src b/numpy/core/src/npymath/npy_math_internal.h.src
index dd2424db8..5b418342f 100644
--- a/numpy/core/src/npymath/npy_math_internal.h.src
+++ b/numpy/core/src/npymath/npy_math_internal.h.src
@@ -477,10 +477,16 @@ NPY_INPLACE @type@ npy_frexp@c@(@type@ x, int* exp)
 
 /**begin repeat
  * #type = npy_longdouble, npy_double, npy_float#
+ * #TYPE = LONGDOUBLE, DOUBLE, FLOAT#
  * #c = l,,f#
  * #C = L,,F#
  */
-
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_@TYPE@ == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, @c@)
+#endif
 /*
  * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
  * raise "invalid" when given INFINITY as input.
@@ -506,7 +512,7 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
         return (x - x);
     }
 #endif
-    return @kind@@c@(x);
+    return NPY__FP_SFX(@kind@)(x);
 }
 #endif
 
@@ -521,7 +527,7 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x)
 #ifdef HAVE_@KIND@@C@
 NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
 {
-    return @kind@@c@(x, y);
+    return NPY__FP_SFX(@kind@)(x, y);
 }
 #endif
 /**end repeat1**/
@@ -529,21 +535,21 @@ NPY_INPLACE @type@ npy_@kind@@c@(@type@ x, @type@ y)
 #ifdef HAVE_MODF@C@
 NPY_INPLACE @type@ npy_modf@c@(@type@ x, @type@ *iptr)
 {
-    return modf@c@(x, iptr);
+    return NPY__FP_SFX(modf)(x, iptr);
 }
 #endif
 
 #ifdef HAVE_LDEXP@C@
 NPY_INPLACE @type@ npy_ldexp@c@(@type@ x, int exp)
 {
-    return ldexp@c@(x, exp);
+    return NPY__FP_SFX(ldexp)(x, exp);
 }
 #endif
 
 #ifdef HAVE_FREXP@C@
 NPY_INPLACE @type@ npy_frexp@c@(@type@ x, int* exp)
 {
-    return frexp@c@(x, exp);
+    return NPY__FP_SFX(frexp)(x, exp);
 }
 #endif
 
@@ -566,10 +572,10 @@ NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
 #else
 NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
 {
-    return cbrt@c@(x);
+    return NPY__FP_SFX(cbrt)(x);
 }
 #endif
-
+#undef NPY__FP_SFX
 /**end repeat**/
 
 
@@ -579,10 +585,16 @@ NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
 
 /**begin repeat
  * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
  * #c = f, ,l#
  * #C = F, ,L#
  */
-
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_@TYPE@ == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, @c@)
+#endif
 @type@ npy_heaviside@c@(@type@ x, @type@ h0)
 {
     if (npy_isnan(x)) {
@@ -599,10 +611,10 @@ NPY_INPLACE @type@ npy_cbrt@c@(@type@ x)
     }
 }
 
-#define LOGE2    NPY_LOGE2@c@
-#define LOG2E    NPY_LOG2E@c@
-#define RAD2DEG  (180.0@c@/NPY_PI@c@)
-#define DEG2RAD  (NPY_PI@c@/180.0@c@)
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
 
 NPY_INPLACE @type@ npy_rad2deg@c@(@type@ x)
 {
@@ -756,7 +768,7 @@ npy_divmod@c@(@type@ a, @type@ b, @type@ *modulus)
 #undef LOG2E
 #undef RAD2DEG
 #undef DEG2RAD
-
+#undef NPY__FP_SFX
 /**end repeat**/
 
 /**begin repeat
diff --git a/numpy/core/src/umath/svml b/numpy/core/src/umath/svml
-Subproject 9f8af767ed6c75455d9a382af829048f8dd1806
+Subproject 1c5260a61e7dce6be48073dfa96291edb0a11d7
diff --git a/numpy/core/tests/test_dtype.py b/numpy/core/tests/test_dtype.py
index 8fe859919..e49604e4d 100644
--- a/numpy/core/tests/test_dtype.py
+++ b/numpy/core/tests/test_dtype.py
@@ -627,6 +627,12 @@ class TestSubarray:
         t2 = np.dtype('2i4', align=True)
         assert_equal(t1.alignment, t2.alignment)
 
+    def test_aligned_empty(self):
+        # Mainly regression test for gh-19696: construction failed completely
+        dt = np.dtype([], align=True)
+        assert dt == np.dtype([])
+        dt = np.dtype({"names": [], "formats": [], "itemsize": 0}, align=True)
+        assert dt == np.dtype([])
 
 def iter_struct_object_dtypes():
     """
@@ -723,26 +729,30 @@ class TestStructuredObjectRefcounting:
     def test_structured_object_indexing(self, shape, index, items_changed,
                                         dt, pat, count, singleton):
         """Structured object reference counting for advanced indexing."""
-        zero = 0
-        one = 1
+        # Use two small negative values (should be singletons, but less likely
+        # to run into race-conditions).  This failed in some threaded envs
+        # When using 0 and 1.  If it fails again, should remove all explicit
+        # checks, and rely on `pytest-leaks` reference count checker only.
+        val0 = -4
+        val1 = -5
 
-        arr = np.zeros(shape, dt)
+        arr = np.full(shape, val0, dt)
 
         gc.collect()
-        before_zero = sys.getrefcount(zero)
-        before_one = sys.getrefcount(one)
+        before_val0 = sys.getrefcount(val0)
+        before_val1 = sys.getrefcount(val1)
         # Test item getting:
         part = arr[index]
-        after_zero = sys.getrefcount(zero)
-        assert after_zero - before_zero == count * items_changed
+        after_val0 = sys.getrefcount(val0)
+        assert after_val0 - before_val0 == count * items_changed
         del part
         # Test item setting:
-        arr[index] = one
+        arr[index] = val1
         gc.collect()
-        after_zero = sys.getrefcount(zero)
-        after_one = sys.getrefcount(one)
-        assert before_zero - after_zero == count * items_changed
-        assert after_one - before_one == count * items_changed
+        after_val0 = sys.getrefcount(val0)
+        after_val1 = sys.getrefcount(val1)
+        assert before_val0 - after_val0 == count * items_changed
+        assert after_val1 - before_val1 == count * items_changed
 
     @pytest.mark.parametrize(['dt', 'pat', 'count', 'singleton'],
                              iter_struct_object_dtypes())
diff --git a/numpy/core/tests/test_mem_policy.py b/numpy/core/tests/test_mem_policy.py
index abf340062..3dae36d5a 100644
--- a/numpy/core/tests/test_mem_policy.py
+++ b/numpy/core/tests/test_mem_policy.py
@@ -19,6 +19,10 @@ def get_module(tmp_path):
     if sys.platform.startswith('cygwin'):
         pytest.skip('link fails on cygwin')
     functions = [
+        ("get_default_policy", "METH_NOARGS", """
+             Py_INCREF(PyDataMem_DefaultHandler);
+             return PyDataMem_DefaultHandler;
+         """),
         ("set_secret_data_policy", "METH_NOARGS", """
              PyObject *secret_data =
                  PyCapsule_New(&secret_data_handler, "mem_handler", NULL);
@@ -37,11 +41,7 @@ def get_module(tmp_path):
              else {
                  old = PyDataMem_SetHandler(NULL);
              }
-             if (old == NULL) {
-                 return NULL;
-             }
-             Py_DECREF(old);
-             Py_RETURN_NONE;
+             return old;
          """),
         ("get_array", "METH_NOARGS", """
             char *buf = (char *)malloc(20);
@@ -238,6 +238,27 @@ def test_set_policy(get_module):
         assert get_handler_name() == orig_policy_name
 
 
+def test_default_policy_singleton(get_module):
+    get_handler_name = np.core.multiarray.get_handler_name
+
+    # set the policy to default
+    orig_policy = get_module.set_old_policy(None)
+
+    assert get_handler_name() == 'default_allocator'
+
+    # re-set the policy to default
+    def_policy_1 = get_module.set_old_policy(None)
+
+    assert get_handler_name() == 'default_allocator'
+
+    # set the policy to original
+    def_policy_2 = get_module.set_old_policy(orig_policy)
+
+    # since default policy is a singleton,
+    # these should be the same object
+    assert def_policy_1 is def_policy_2 is get_module.get_default_policy()
+
+
 def test_policy_propagation(get_module):
     # The memory policy goes hand-in-hand with flags.owndata
 
diff --git a/numpy/distutils/log.py b/numpy/distutils/log.py
index a8113b9c6..3347f56d6 100644
--- a/numpy/distutils/log.py
+++ b/numpy/distutils/log.py
@@ -87,3 +87,25 @@ _global_color_map = {
 
 # don't use INFO,.. flags in set_verbosity, these flags are for set_threshold.
 set_verbosity(0, force=True)
+
+
+_error = error
+_warn = warn
+_info = info
+_debug = debug
+
+
+def error(msg, *a, **kw):
+    _error(f"ERROR: {msg}", *a, **kw)
+
+
+def warn(msg, *a, **kw):
+    _warn(f"WARN: {msg}", *a, **kw)
+
+
+def info(msg, *a, **kw):
+    _info(f"INFO: {msg}", *a, **kw)
+
+
+def debug(msg, *a, **kw):
+    _debug(f"DEBUG: {msg}", *a, **kw)
diff --git a/numpy/distutils/tests/test_log.py b/numpy/distutils/tests/test_log.py
new file mode 100644
index 000000000..36f49f592
--- /dev/null
+++ b/numpy/distutils/tests/test_log.py
@@ -0,0 +1,32 @@
+import io
+import re
+from contextlib import redirect_stdout
+
+import pytest
+
+from numpy.distutils import log
+
+
+def setup_module():
+    log.set_verbosity(2, force=True)  # i.e. DEBUG
+
+
+def teardown_module():
+    log.set_verbosity(0, force=True)  # the default
+
+
+r_ansi = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
+
+
+@pytest.mark.parametrize("func_name", ["error", "warn", "info", "debug"])
+def test_log_prefix(func_name):
+    func = getattr(log, func_name)
+    msg = f"{func_name} message"
+    f = io.StringIO()
+    with redirect_stdout(f):
+        func(msg)
+    out = f.getvalue()
+    assert out  # sanity check
+    clean_out = r_ansi.sub("", out)
+    line = next(line for line in clean_out.splitlines())
+    assert line == f"{func_name.upper()}: {msg}"
diff --git a/numpy/random/_generator.pyi b/numpy/random/_generator.pyi
index 64b683d7c..c574bef9a 100644
--- a/numpy/random/_generator.pyi
+++ b/numpy/random/_generator.pyi
@@ -623,7 +623,9 @@ class Generator:
         method: Literal["svd", "eigh", "cholesky"] = ...,
     ) -> ndarray[Any, dtype[float64]]: ...
     def multinomial(
-        self, n: _ArrayLikeInt_co, pvals: _ArrayLikeFloat_co, size: Optional[_ShapeLike] = ...
+        self, n: _ArrayLikeInt_co,
+            pvals: _ArrayLikeFloat_co,
+            size: Optional[_ShapeLike] = ...
     ) -> ndarray[Any, dtype[int64]]: ...
     def multivariate_hypergeometric(
         self,
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index 7087b6e1d..2134f0e24 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -3683,24 +3683,35 @@ cdef class Generator:
         ----------
         n : int or array-like of ints
             Number of experiments.
-        pvals : sequence of floats, length p
-            Probabilities of each of the ``p`` different outcomes.  These
-            must sum to 1 (however, the last element is always assumed to
-            account for the remaining probability, as long as
-            ``sum(pvals[:-1]) <= 1)``.
+        pvals : array-like of floats
+            Probabilities of each of the ``p`` different outcomes with shape
+            ``(k0, k1, ..., kn, p)``. Each element ``pvals[i,j,...,:]`` must
+            sum to 1 (however, the last element is always assumed to account
+            for the remaining probability, as long as
+            ``sum(pvals[..., :-1], axis=-1) <= 1.0``. Must have at least 1
+            dimension where pvals.shape[-1] > 0.
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
-            ``m * n * k`` samples are drawn.  Default is None, in which case a
-            single value is returned.
+            ``m * n * k`` samples are drawn each with ``p`` elements. Default
+            is None where the output size is determined by the broadcast shape
+            of ``n`` and all by the final dimension of ``pvals``, which is
+            denoted as ``b=(b0, b1, ..., bq)``. If size is not None, then it
+            must be compatible with the broadcast shape ``b``. Specifically,
+            size must have ``q`` or more elements and size[-(q-j):] must equal
+            ``bj``.
 
         Returns
         -------
         out : ndarray
-            The drawn samples, of shape *size*, if that was provided.  If not,
-            the shape is ``(N,)``.
+            The drawn samples, of shape size, if provided. When size is
+            provided, the output shape is size + (p,)  If not specified,
+            the shape is determined by the broadcast shape of ``n`` and
+            ``pvals``, ``(b0, b1, ..., bq)`` augmented with the dimension of
+            the multinomial, ``p``, so that that output shape is
+            ``(b0, b1, ..., bq, p)``.
 
-            In other words, each entry ``out[i,j,...,:]`` is an N-dimensional
-            value drawn from the distribution.
+            Each entry ``out[i,j,...,:]`` is a ``p``-dimensional value drawn
+            from the distribution.
 
         Examples
         --------
@@ -3738,6 +3749,38 @@ cdef class Generator:
         >>> rng.multinomial(100, [1/7.]*5 + [2/7.])
         array([11, 16, 14, 17, 16, 26])  # random
 
+        Simulate 10 throws of a 4-sided die and 20 throws of a 6-sided die
+
+        >>> rng.multinomial([10, 20],[[1/4]*4 + [0]*2, [1/6]*6])
+        array([[2, 1, 4, 3, 0, 0],
+               [3, 3, 3, 6, 1, 4]], dtype=int64)  # random
+
+        Generate categorical random variates from two categories where the
+        first has 3 outcomes and the second has 2.
+
+        >>> rng.multinomial(1, [[.1, .5, .4 ], [.3, .7, .0]])
+        array([[0, 0, 1],
+               [0, 1, 0]], dtype=int64)  # random
+
+        ``argmax(axis=-1)`` is then used to return the categories.
+
+        >>> pvals = [[.1, .5, .4 ], [.3, .7, .0]]
+        >>> rvs = rng.multinomial(1, pvals, size=(4,2))
+        >>> rvs.argmax(axis=-1)
+        array([[0, 1],
+               [2, 0],
+               [2, 1],
+               [2, 0]], dtype=int64)  # random
+
+        The same output dimension can be produced using broadcasting.
+
+        >>> rvs = rng.multinomial([[1]] * 4, pvals)
+        >>> rvs.argmax(axis=-1)
+        array([[0, 1],
+               [2, 0],
+               [2, 1],
+               [2, 0]], dtype=int64)  # random
+
         The probability inputs should be normalized. As an implementation
         detail, the value of the last entry is ignored and assumed to take
         up any leftover probability mass, but this should not be relied on.
@@ -3752,47 +3795,82 @@ cdef class Generator:
         >>> rng.multinomial(100, [1.0, 2.0])  # WRONG
         Traceback (most recent call last):
         ValueError: pvals < 0, pvals > 1 or pvals contains NaNs
-
         """
 
-        cdef np.npy_intp d, i, sz, offset
+        cdef np.npy_intp d, i, sz, offset, pi
         cdef np.ndarray parr, mnarr, on, temp_arr
         cdef double *pix
+        cdef int ndim
         cdef int64_t *mnix
         cdef int64_t ni
         cdef np.broadcast it
+        on = <np.ndarray>np.PyArray_FROM_OTF(n,
+                                             np.NPY_INT64,
+                                             np.NPY_ARRAY_ALIGNED |
+                                             np.NPY_ARRAY_C_CONTIGUOUS)
+        parr = <np.ndarray>np.PyArray_FROM_OTF(pvals,
+                                               np.NPY_DOUBLE,
+                                               np.NPY_ARRAY_ALIGNED |
+                                               np.NPY_ARRAY_C_CONTIGUOUS)
+        ndim = parr.ndim
+        d = parr.shape[ndim - 1] if ndim >= 1 else 0
+        if d == 0:
+            raise ValueError(
+                "pvals must have at least 1 dimension and the last dimension "
+                "of pvals must be greater than 0."
+            )
 
-        d = len(pvals)
-        on = <np.ndarray>np.PyArray_FROM_OTF(n, np.NPY_INT64, np.NPY_ALIGNED)
-        parr = <np.ndarray>np.PyArray_FROMANY(
-            pvals, np.NPY_DOUBLE, 1, 1, np.NPY_ARRAY_ALIGNED | np.NPY_ARRAY_C_CONTIGUOUS)
-        pix = <double*>np.PyArray_DATA(parr)
         check_array_constraint(parr, 'pvals', CONS_BOUNDED_0_1)
-        if kahan_sum(pix, d-1) > (1.0 + 1e-12):
-            # When floating, but not float dtype, and close, improve the error
-            # 1.0001 works for float16 and float32
-            if (isinstance(pvals, np.ndarray)
-                    and np.issubdtype(pvals.dtype, np.floating)
-                    and pvals.dtype != float
-                    and pvals.sum() < 1.0001):
-                msg = ("sum(pvals[:-1].astype(np.float64)) > 1.0. The pvals "
-                       "array is cast to 64-bit floating point prior to "
-                       "checking the sum. Precision changes when casting may "
-                       "cause problems even if the sum of the original pvals "
-                       "is valid.")
-            else:
-                msg = "sum(pvals[:-1]) > 1.0"
-            raise ValueError(msg)
+        pix = <double*>np.PyArray_DATA(parr)
+        sz = np.PyArray_SIZE(parr)
+        # Cython 0.29.20 would not correctly translate the range-based for
+        # loop to a C for loop
+        # for offset in range(<np.npy_intp>0, sz, d):
+        offset = 0
+        while offset < sz:
+            if kahan_sum(pix + offset, d-1) > (1.0 + 1e-12):
+                # When floating, but not float dtype, and close, improve the error
+                # 1.0001 works for float16 and float32
+                slice_repr = "[:-1]" if ndim == 1 else "[...,:-1]"
+                if (isinstance(pvals, np.ndarray)
+                        and np.issubdtype(pvals.dtype, np.floating)
+                        and pvals.dtype != float
+                        and pvals.sum() < 1.0001):
+                    msg = (f"sum(pvals{slice_repr}.astype(np.float64)) > 1.0."
+                           " The pvals array is cast to 64-bit floating"
+                           " point prior to checking the sum. Precision "
+                           "changes when casting may cause problems even "
+                           "if the sum of the original pvals is valid.")
+                else:
+                    msg = f"sum(pvals{slice_repr}) > 1.0"
+                raise ValueError(msg)
+            offset += d
 
-        if np.PyArray_NDIM(on) != 0: # vector
+        if np.PyArray_NDIM(on) != 0 or ndim > 1: # vector
             check_array_constraint(on, 'n', CONS_NON_NEGATIVE)
+            # This provides the offsets to use in the C-contig parr when
+            # broadcasting
+            offsets = <np.ndarray>np.arange(
+                0, np.PyArray_SIZE(parr), d, dtype=np.intp
+            ).reshape((<object>parr).shape[:ndim - 1])
             if size is None:
-                it = np.PyArray_MultiIterNew1(on)
+                it = np.PyArray_MultiIterNew2(on, offsets)
             else:
                 temp = np.empty(size, dtype=np.int8)
                 temp_arr = <np.ndarray>temp
-                it = np.PyArray_MultiIterNew2(on, temp_arr)
-                validate_output_shape(it.shape, temp_arr)
+                it = np.PyArray_MultiIterNew3(on, offsets, temp_arr)
+                # Validate size and the broadcast shape
+                try:
+                    size = (operator.index(size),)
+                except:
+                    size = tuple(size)
+                # This test verifies that an axis with dim 1 in size has not
+                # been increased by broadcasting with the input
+                if it.shape != size:
+                    raise ValueError(
+                        f"Output size {size} is not compatible with "
+                        f"broadcast dimensions of inputs {it.shape}."
+                    )
             shape = it.shape + (d,)
             multin = np.zeros(shape, dtype=np.int64)
             mnarr = <np.ndarray>multin
@@ -3802,7 +3880,8 @@ cdef class Generator:
             with self.lock, nogil:
                 for i in range(sz):
                     ni = (<int64_t*>np.PyArray_MultiIter_DATA(it, 0))[0]
-                    random_multinomial(&self._bitgen, ni, &mnix[offset], pix, d, &self._binomial)
+                    pi = (<np.npy_intp*>np.PyArray_MultiIter_DATA(it, 1))[0]
+                    random_multinomial(&self._bitgen, ni, &mnix[offset], &pix[pi], d, &self._binomial)
                     offset += d
                     np.PyArray_MultiIter_NEXT(it)
             return multin
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index d057122f1..e5411b8ef 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -136,12 +136,6 @@ class TestMultinomial:
         contig = random.multinomial(100, pvals=np.ascontiguousarray(pvals))
         assert_array_equal(non_contig, contig)
 
-    def test_multidimensional_pvals(self):
-        assert_raises(ValueError, random.multinomial, 10, [[0, 1]])
-        assert_raises(ValueError, random.multinomial, 10, [[0], [1]])
-        assert_raises(ValueError, random.multinomial, 10, [[[0], [1]], [[1], [0]]])
-        assert_raises(ValueError, random.multinomial, 10, np.array([[0, 1], [1, 0]]))
-
     def test_multinomial_pvals_float32(self):
         x = np.array([9.9e-01, 9.9e-01, 1.0e-09, 1.0e-09, 1.0e-09, 1.0e-09,
                       1.0e-09, 1.0e-09, 1.0e-09, 1.0e-09], dtype=np.float32)
@@ -2361,6 +2355,64 @@ class TestBroadcast:
                             [2, 3, 6, 4, 2, 3]], dtype=np.int64)
         assert_array_equal(actual, desired)
 
+        random = Generator(MT19937(self.seed))
+        actual = random.multinomial([5, 20], [[1 / 6.] * 6] * 2)
+        desired = np.array([[0, 0, 2, 1, 2, 0],
+                            [2, 3, 6, 4, 2, 3]], dtype=np.int64)
+        assert_array_equal(actual, desired)
+
+        random = Generator(MT19937(self.seed))
+        actual = random.multinomial([[5], [20]], [[1 / 6.] * 6] * 2)
+        desired = np.array([[[0, 0, 2, 1, 2, 0],
+                             [0, 0, 2, 1, 1, 1]],
+                            [[4, 2, 3, 3, 5, 3],
+                             [7, 2, 2, 1, 4, 4]]], dtype=np.int64)
+        assert_array_equal(actual, desired)
+
+    @pytest.mark.parametrize("n", [10,
+                                   np.array([10, 10]),
+                                   np.array([[[10]], [[10]]])
+                                   ]
+                             )
+    def test_multinomial_pval_broadcast(self, n):
+        random = Generator(MT19937(self.seed))
+        pvals = np.array([1 / 4] * 4)
+        actual = random.multinomial(n, pvals)
+        n_shape = tuple() if isinstance(n, int) else n.shape
+        expected_shape = n_shape + (4,)
+        assert actual.shape == expected_shape
+        pvals = np.vstack([pvals, pvals])
+        actual = random.multinomial(n, pvals)
+        expected_shape = np.broadcast_shapes(n_shape, pvals.shape[:-1]) + (4,)
+        assert actual.shape == expected_shape
+
+        pvals = np.vstack([[pvals], [pvals]])
+        actual = random.multinomial(n, pvals)
+        expected_shape = np.broadcast_shapes(n_shape, pvals.shape[:-1])
+        assert actual.shape == expected_shape + (4,)
+        actual = random.multinomial(n, pvals, size=(3, 2) + expected_shape)
+        assert actual.shape == (3, 2) + expected_shape + (4,)
+
+        with pytest.raises(ValueError):
+            # Ensure that size is not broadcast
+            actual = random.multinomial(n, pvals, size=(1,) * 6)
+
+    def test_invalid_pvals_broadcast(self):
+        random = Generator(MT19937(self.seed))
+        pvals = [[1 / 6] * 6, [1 / 4] * 6]
+        assert_raises(ValueError, random.multinomial, 1, pvals)
+        assert_raises(ValueError, random.multinomial, 6, 0.5)
+
+    def test_empty_outputs(self):
+        random = Generator(MT19937(self.seed))
+        actual = random.multinomial(np.empty((10, 0, 6), "i8"), [1 / 6] * 6)
+        assert actual.shape == (10, 0, 6, 6)
+        actual = random.multinomial(12, np.empty((10, 0, 10)))
+        assert actual.shape == (10, 0, 10)
+        actual = random.multinomial(np.empty((3, 0, 7), "i8"),
+                                    np.empty((3, 0, 7, 4)))
+        assert actual.shape == (3, 0, 7, 4)
+
 
 class TestThread:
     # make sure each state produces the same sequence even in threads