10 files changed, 172 insertions, 37 deletions
diff --git a/numpy/lib/format.py b/numpy/lib/format.py
index 625768b62..19fec48ed 100644
--- a/numpy/lib/format.py
+++ b/numpy/lib/format.py
@@ -186,6 +186,10 @@ _header_size_info = {
     (3, 0): ('<I', 'utf8'),
 }
 
+# Python's literal_eval is not actually safe for large inputs, since parsing
+# may become slow or even cause interpreter crashes.
+# This is an arbitrary, low limit which should make it safe in practice.
+_MAX_HEADER_SIZE = 10000
 
 def _check_version(version):
     if version not in [(1, 0), (2, 0), (3, 0), None]:
@@ -465,7 +469,7 @@ def write_array_header_2_0(fp, d):
     """
     _write_array_header(fp, d, (2, 0))
 
-def read_array_header_1_0(fp):
+def read_array_header_1_0(fp, max_header_size=_MAX_HEADER_SIZE):
     """
     Read an array header from a filelike object using the 1.0 file format
     version.
@@ -487,6 +491,10 @@ def read_array_header_1_0(fp):
         contiguous before writing it out.
     dtype : dtype
         The dtype of the file's data.
+    max_header_size : int, optional
+        Maximum allowed size of the header.  Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
 
     Raises
     ------
@@ -494,9 +502,10 @@ def read_array_header_1_0(fp):
         If the data is invalid.
 
     """
-    return _read_array_header(fp, version=(1, 0))
+    return _read_array_header(
+            fp, version=(1, 0), max_header_size=max_header_size)
 
-def read_array_header_2_0(fp):
+def read_array_header_2_0(fp, max_header_size=_MAX_HEADER_SIZE):
     """
     Read an array header from a filelike object using the 2.0 file format
     version.
@@ -509,6 +518,10 @@ def read_array_header_2_0(fp):
     ----------
     fp : filelike object
         A file object or something with a `.read()` method like a file.
+    max_header_size : int, optional
+        Maximum allowed size of the header.  Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
 
     Returns
     -------
@@ -527,7 +540,8 @@ def read_array_header_2_0(fp):
         If the data is invalid.
 
     """
-    return _read_array_header(fp, version=(2, 0))
+    return _read_array_header(
+            fp, version=(2, 0), max_header_size=max_header_size)
 
 
 def _filter_header(s):
@@ -565,7 +579,7 @@ def _filter_header(s):
     return tokenize.untokenize(tokens)
 
 
-def _read_array_header(fp, version):
+def _read_array_header(fp, version, max_header_size=_MAX_HEADER_SIZE):
     """
     see read_array_header_1_0
     """
@@ -581,6 +595,14 @@ def _read_array_header(fp, version):
     header_length = struct.unpack(hlength_type, hlength_str)[0]
     header = _read_bytes(fp, header_length, "array header")
     header = header.decode(encoding)
+    if len(header) > max_header_size:
+        raise ValueError(
+            f"Header info length ({len(header)}) is large and may not be safe "
+            "to load securely.\n"
+            "To allow loading, adjust `max_header_size` or fully trust "
+            "the `.npy` file using `allow_pickle=True`.\n"
+            "For safety against large resource use or crashes, sandboxing "
+            "may be necessary.")
 
     # The header is a pretty-printed string representation of a literal
     # Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
@@ -694,7 +716,8 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
                 fp.write(chunk.tobytes('C'))
 
 
-def read_array(fp, allow_pickle=False, pickle_kwargs=None):
+def read_array(fp, allow_pickle=False, pickle_kwargs=None, *,
+               max_header_size=_MAX_HEADER_SIZE):
     """
     Read an array from an NPY file.
 
@@ -713,6 +736,12 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
         Additional keyword arguments to pass to pickle.load. These are only
         useful when loading object arrays saved on Python 2 when using
         Python 3.
+    max_header_size : int, optional
+        Maximum allowed size of the header.  Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
+        This option is ignored when `allow_pickle` is passed.  In that case
+        the file is by definition trusted and the limit is unnecessary.
 
     Returns
     -------
@@ -726,9 +755,15 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
         an object array.
 
     """
+    if allow_pickle:
+        # Effectively ignore max_header_size, since `allow_pickle` indicates
+        # that the input is fully trusted.
+        max_header_size = 2**64
+
     version = read_magic(fp)
     _check_version(version)
-    shape, fortran_order, dtype = _read_array_header(fp, version)
+    shape, fortran_order, dtype = _read_array_header(
+            fp, version, max_header_size=max_header_size)
     if len(shape) == 0:
         count = 1
     else:
@@ -788,7 +823,8 @@ def read_array(fp, allow_pickle=False, pickle_kwargs=None):
 
 
 def open_memmap(filename, mode='r+', dtype=None, shape=None,
-                fortran_order=False, version=None):
+                fortran_order=False, version=None, *,
+                max_header_size=_MAX_HEADER_SIZE):
     """
     Open a .npy file as a memory-mapped array.
 
@@ -819,6 +855,10 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         If the mode is a "write" mode, then this is the version of the file
         format used to create the file.  None means use the oldest
         supported version that is able to store the data.  Default: None
+    max_header_size : int, optional
+        Maximum allowed size of the header.  Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
 
     Returns
     -------
@@ -866,7 +906,8 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
             version = read_magic(fp)
             _check_version(version)
 
-            shape, fortran_order, dtype = _read_array_header(fp, version)
+            shape, fortran_order, dtype = _read_array_header(
+                    fp, version, max_header_size=max_header_size)
             if dtype.hasobject:
                 msg = "Array can't be memory-mapped: Python objects in dtype."
                 raise ValueError(msg)
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index f0ec40a2e..343d76ae3 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -139,6 +139,12 @@ class NpzFile(Mapping):
         Additional keyword arguments to pass on to pickle.load.
         These are only useful when loading object arrays saved on
         Python 2 when using Python 3.
+    max_header_size : int, optional
+        Maximum allowed size of the header.  Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
+        This option is ignored when `allow_pickle` is passed.  In that case
+        the file is by definition trusted and the limit is unnecessary.
 
     Parameters
     ----------
@@ -174,13 +180,15 @@ class NpzFile(Mapping):
     fid = None
 
     def __init__(self, fid, own_fid=False, allow_pickle=False,
-                 pickle_kwargs=None):
+                 pickle_kwargs=None, *,
+                 max_header_size=format._MAX_HEADER_SIZE):
         # Import is postponed to here since zipfile depends on gzip, an
         # optional component of the so-called standard library.
         _zip = zipfile_factory(fid)
         self._files = _zip.namelist()
         self.files = []
         self.allow_pickle = allow_pickle
+        self.max_header_size = max_header_size
         self.pickle_kwargs = pickle_kwargs
         for x in self._files:
             if x.endswith('.npy'):
@@ -244,7 +252,8 @@ class NpzFile(Mapping):
                 bytes = self.zip.open(key)
                 return format.read_array(bytes,
                                          allow_pickle=self.allow_pickle,
-                                         pickle_kwargs=self.pickle_kwargs)
+                                         pickle_kwargs=self.pickle_kwargs,
+                                         max_header_size=self.max_header_size)
             else:
                 return self.zip.read(key)
         else:
@@ -253,7 +262,7 @@ class NpzFile(Mapping):
 
 @set_module('numpy')
 def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
-         encoding='ASCII'):
+         encoding='ASCII', *, max_header_size=format._MAX_HEADER_SIZE):
     """
     Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.
 
@@ -297,6 +306,12 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
         npy/npz files containing object arrays. Values other than 'latin1',
         'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
         data. Default: 'ASCII'
+    max_header_size : int, optional
+        Maximum allowed size of the header.  Large headers may not be safe
+        to load securely and thus require explicitly passing a larger value.
+        See :py:func:`ast.literal_eval()` for details.
+        This option is ignored when `allow_pickle` is passed.  In that case
+        the file is by definition trusted and the limit is unnecessary.
 
     Returns
     -------
@@ -403,15 +418,20 @@ def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True,
             # Potentially transfer file ownership to NpzFile
             stack.pop_all()
             ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle,
-                          pickle_kwargs=pickle_kwargs)
+                          pickle_kwargs=pickle_kwargs,
+                          max_header_size=max_header_size)
             return ret
         elif magic == format.MAGIC_PREFIX:
             # .npy file
             if mmap_mode:
-                return format.open_memmap(file, mode=mmap_mode)
+                if allow_pickle:
+                    max_header_size = 2**64
+                return format.open_memmap(file, mode=mmap_mode,
+                                          max_header_size=max_header_size)
             else:
                 return format.read_array(fid, allow_pickle=allow_pickle,
-                                         pickle_kwargs=pickle_kwargs)
+                                         pickle_kwargs=pickle_kwargs,
+                                         max_header_size=max_header_size)
         else:
             # Try a pickle
             if not allow_pickle:
diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
index 581d067de..53d3bf1d3 100644
--- a/numpy/lib/tests/test_format.py
+++ b/numpy/lib/tests/test_format.py
@@ -459,6 +459,7 @@ def test_long_str():
     assert_array_equal(long_str_arr, long_str_arr2)
 
 
+@pytest.mark.slow
 def test_memmap_roundtrip(tmpdir):
     for i, arr in enumerate(basic_arrays + record_arrays):
         if arr.dtype.hasobject:
@@ -667,7 +668,7 @@ def test_version_2_0():
     assert_(len(header) % format.ARRAY_ALIGN == 0)
 
     f.seek(0)
-    n = format.read_array(f)
+    n = format.read_array(f, max_header_size=200000)
     assert_array_equal(d, n)
 
     # 1.0 requested but data cannot be saved this way
@@ -689,7 +690,7 @@ def test_version_2_0_memmap(tmpdir):
                             shape=d.shape, version=(2, 0))
     ma[...] = d
     ma.flush()
-    ma = format.open_memmap(tf1, mode='r')
+    ma = format.open_memmap(tf1, mode='r', max_header_size=200000)
     assert_array_equal(ma, d)
 
     with warnings.catch_warnings(record=True) as w:
@@ -700,9 +701,49 @@ def test_version_2_0_memmap(tmpdir):
         ma[...] = d
         ma.flush()
 
-    ma = format.open_memmap(tf2, mode='r')
+    ma = format.open_memmap(tf2, mode='r', max_header_size=200000)
+
     assert_array_equal(ma, d)
 
+@pytest.mark.parametrize("mmap_mode", ["r", None])
+def test_huge_header(tmpdir, mmap_mode):
+    f = os.path.join(tmpdir, 'large_header.npy')
+    arr = np.array(1, dtype="i,"*10000+"i")  # 10001 fields -> huge header
+
+    with pytest.warns(UserWarning, match=".*format 2.0"):
+        np.save(f, arr)
+
+    with pytest.raises(ValueError, match="Header.*large"):
+        np.load(f, mmap_mode=mmap_mode)  # default limit rejects it
+
+    with pytest.raises(ValueError, match="Header.*large"):
+        np.load(f, mmap_mode=mmap_mode, max_header_size=20000)  # too small
+
+    res = np.load(f, mmap_mode=mmap_mode, allow_pickle=True)  # limit ignored
+    assert_array_equal(res, arr)
+
+    res = np.load(f, mmap_mode=mmap_mode, max_header_size=180000)  # big enough
+    assert_array_equal(res, arr)
+
+def test_huge_header_npz(tmpdir):
+    f = os.path.join(tmpdir, 'large_header.npz')
+    arr = np.array(1, dtype="i,"*10000+"i")  # 10001 fields -> huge header
+
+    with pytest.warns(UserWarning, match=".*format 2.0"):
+        np.savez(f, arr=arr)
+
+    # Only getting the array from the file actually reads it
+    with pytest.raises(ValueError, match="Header.*large"):
+        np.load(f)["arr"]  # default limit rejects it
+
+    with pytest.raises(ValueError, match="Header.*large"):
+        np.load(f, max_header_size=20000)["arr"]  # too small
+
+    res = np.load(f, allow_pickle=True)["arr"]  # limit ignored
+    assert_array_equal(res, arr)
+
+    res = np.load(f, max_header_size=180000)["arr"]  # big enough
+    assert_array_equal(res, arr)
 
 def test_write_version():
     f = BytesIO()
diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py
index 9e4fe7ebb..831a78ec1 100644
--- a/numpy/lib/utils.py
+++ b/numpy/lib/utils.py
@@ -971,6 +971,12 @@ def safe_eval(source):
     Evaluate a string containing a Python literal expression without
     allowing the execution of arbitrary non-literal code.
 
+    .. warning::
+
+        This function is identical to :py:func:`ast.literal_eval` and
+        has the same security implications.  It may not always be safe
+        to evaluate large input strings.
+
     Parameters
     ----------
     source : str
diff --git a/numpy/random/_common.pxd b/numpy/random/_common.pxd
index 3625634cd..3eaf39ddf 100644
--- a/numpy/random/_common.pxd
+++ b/numpy/random/_common.pxd
@@ -17,8 +17,8 @@ cdef enum ConstraintType:
     CONS_POSITIVE
     CONS_POSITIVE_NOT_NAN
     CONS_BOUNDED_0_1
-    CONS_BOUNDED_0_1_NOTNAN
     CONS_BOUNDED_GT_0_1
+    CONS_BOUNDED_LT_0_1
     CONS_GT_1
     CONS_GTE_1
     CONS_POISSON
diff --git a/numpy/random/_common.pyx b/numpy/random/_common.pyx
index 607034a38..7b6f69303 100644
--- a/numpy/random/_common.pyx
+++ b/numpy/random/_common.pyx
@@ -392,6 +392,9 @@ cdef int check_array_constraint(np.ndarray val, object name, constraint_type con
     elif cons == CONS_BOUNDED_GT_0_1:
         if not np.all(np.greater(val, 0)) or not np.all(np.less_equal(val, 1)):
             raise ValueError("{0} <= 0, {0} > 1 or {0} contains NaNs".format(name))
+    elif cons == CONS_BOUNDED_LT_0_1:
+        if not np.all(np.greater_equal(val, 0)) or not np.all(np.less(val, 1)):
+            raise ValueError("{0} < 0, {0} >= 1 or {0} contains NaNs".format(name))
     elif cons == CONS_GT_1:
         if not np.all(np.greater(val, 1)):
             raise ValueError("{0} <= 1 or {0} contains NaNs".format(name))
@@ -428,6 +431,9 @@ cdef int check_constraint(double val, object name, constraint_type cons) except
     elif cons == CONS_BOUNDED_GT_0_1:
         if not val >0 or not val <= 1:
             raise ValueError("{0} <= 0, {0} > 1 or {0} contains NaNs".format(name))
+    elif cons == CONS_BOUNDED_LT_0_1:
+        if not (val >= 0) or not (val < 1):
+            raise ValueError("{0} < 0, {0} >= 1 or {0} is NaN".format(name))
     elif cons == CONS_GT_1:
         if not (val > 1):
             raise ValueError("{0} <= 1 or {0} is NaN".format(name))
diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx
index 5218c6d0e..2c25b7191 100644
--- a/numpy/random/_generator.pyx
+++ b/numpy/random/_generator.pyx
@@ -25,7 +25,7 @@ from ._pcg64 import PCG64
 from numpy.random cimport bitgen_t
 from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE,
             CONS_NON_NEGATIVE, CONS_BOUNDED_0_1, CONS_BOUNDED_GT_0_1,
-            CONS_GT_1, CONS_POSITIVE_NOT_NAN, CONS_POISSON,
+            CONS_BOUNDED_LT_0_1, CONS_GT_1, CONS_POSITIVE_NOT_NAN, CONS_POISSON,
             double_fill, cont, kahan_sum, cont_broadcast_3, float_fill, cont_f,
             check_array_constraint, check_constraint, disc, discrete_broadcast_iii,
             validate_output_shape
@@ -3437,12 +3437,12 @@ cdef class Generator:
         Draw samples from a logarithmic series distribution.
 
         Samples are drawn from a log series distribution with specified
-        shape parameter, 0 < ``p`` < 1.
+        shape parameter, 0 <= ``p`` < 1.
 
         Parameters
         ----------
         p : float or array_like of floats
-            Shape parameter for the distribution.  Must be in the range (0, 1).
+            Shape parameter for the distribution.  Must be in the range [0, 1).
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -3506,7 +3506,7 @@ cdef class Generator:
 
         """
         return disc(&random_logseries, &self._bitgen, size, self.lock, 1, 0,
-                 p, 'p', CONS_BOUNDED_0_1,
+                 p, 'p', CONS_BOUNDED_LT_0_1,
                  0.0, '', CONS_NONE,
                  0.0, '', CONS_NONE)
 
diff --git a/numpy/random/mtrand.pyx b/numpy/random/mtrand.pyx
index fcc1f27d2..ae40931d0 100644
--- a/numpy/random/mtrand.pyx
+++ b/numpy/random/mtrand.pyx
@@ -19,8 +19,8 @@ from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64,
 from ._mt19937 import MT19937 as _MT19937
 from numpy.random cimport bitgen_t
 from ._common cimport (POISSON_LAM_MAX, CONS_POSITIVE, CONS_NONE,
-            CONS_NON_NEGATIVE, CONS_BOUNDED_0_1, CONS_BOUNDED_GT_0_1, CONS_GTE_1,
-            CONS_GT_1, LEGACY_CONS_POISSON,
+            CONS_NON_NEGATIVE, CONS_BOUNDED_0_1, CONS_BOUNDED_GT_0_1,
+            CONS_BOUNDED_LT_0_1, CONS_GTE_1, CONS_GT_1, LEGACY_CONS_POISSON,
             double_fill, cont, kahan_sum, cont_broadcast_3,
             check_array_constraint, check_constraint, disc, discrete_broadcast_iii,
             validate_output_shape
@@ -3895,7 +3895,7 @@ cdef class RandomState:
         Draw samples from a logarithmic series distribution.
 
         Samples are drawn from a log series distribution with specified
-        shape parameter, 0 < ``p`` < 1.
+        shape parameter, 0 <= ``p`` < 1.
 
         .. note::
             New code should use the ``logseries`` method of a ``default_rng()``
@@ -3904,7 +3904,7 @@ cdef class RandomState:
         Parameters
         ----------
         p : float or array_like of floats
-            Shape parameter for the distribution.  Must be in the range (0, 1).
+            Shape parameter for the distribution.  Must be in the range [0, 1).
         size : int or tuple of ints, optional
             Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
             ``m * n * k`` samples are drawn.  If size is ``None`` (default),
@@ -3969,7 +3969,7 @@ cdef class RandomState:
 
         """
         out = disc(&legacy_logseries, &self._bitgen, size, self.lock, 1, 0,
-                   p, 'p', CONS_BOUNDED_0_1,
+                   p, 'p', CONS_BOUNDED_LT_0_1,
                    0.0, '', CONS_NONE,
                    0.0, '', CONS_NONE)
         # Match historical output type
diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py
index b550cd508..73d915e02 100644
--- a/numpy/random/tests/test_generator_mt19937.py
+++ b/numpy/random/tests/test_generator_mt19937.py
@@ -1363,10 +1363,22 @@ class TestRandomDist:
                             [5, 1]])
         assert_array_equal(actual, desired)
 
-    def test_logseries_exceptions(self):
-        with np.errstate(invalid='ignore'):
-            assert_raises(ValueError, random.logseries, np.nan)
-            assert_raises(ValueError, random.logseries, [np.nan] * 10)
+    def test_logseries_zero(self):
+        random = Generator(MT19937(self.seed))
+        assert random.logseries(0) == 1  # p == 0 is the inclusive lower bound
+
+    @pytest.mark.parametrize("value", [np.nextafter(0., -1), 1., np.nan, 5.])
+    def test_logseries_exceptions(self, value):
+        random = Generator(MT19937(self.seed))
+        with np.errstate(invalid="ignore"):
+            with pytest.raises(ValueError):
+                random.logseries(value)  # scalar input
+            with pytest.raises(ValueError):
+                # contiguous path: array of 10 copies of the invalid value
+                random.logseries(np.array([value] * 10))
+            with pytest.raises(ValueError):
+                # non-contiguous path: every second element of that array
+                random.logseries(np.array([value] * 10)[::2])
 
     def test_multinomial(self):
         random = Generator(MT19937(self.seed))
diff --git a/numpy/random/tests/test_randomstate.py b/numpy/random/tests/test_randomstate.py
index 22b167224..c0e42ec1e 100644
--- a/numpy/random/tests/test_randomstate.py
+++ b/numpy/random/tests/test_randomstate.py
@@ -942,11 +942,20 @@ class TestRandomDist:
                             [3, 6]])
         assert_array_equal(actual, desired)
 
-    def test_logseries_exceptions(self):
-        with suppress_warnings() as sup:
-            sup.record(RuntimeWarning)
-            assert_raises(ValueError, random.logseries, np.nan)
-            assert_raises(ValueError, random.logseries, [np.nan] * 10)
+    def test_logseries_zero(self):
+        assert random.logseries(0) == 1  # p == 0 is the inclusive lower bound
+
+    @pytest.mark.parametrize("value", [np.nextafter(0., -1), 1., np.nan, 5.])
+    def test_logseries_exceptions(self, value):
+        with np.errstate(invalid="ignore"):
+            with pytest.raises(ValueError):
+                random.logseries(value)  # scalar input
+            with pytest.raises(ValueError):
+                # contiguous path: array of 10 copies of the invalid value
+                random.logseries(np.array([value] * 10))
+            with pytest.raises(ValueError):
+                # non-contiguous path: every second element of that array
+                random.logseries(np.array([value] * 10)[::2])
 
     def test_multinomial(self):
         random.seed(self.seed)