1 files changed, 112 insertions, 21 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 0632ba1f8..ec89397a0 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -164,6 +164,12 @@ class NpzFile(object):
     f : BagObj instance
         An object on which attribute can be performed as an alternative
         to getitem access on the `NpzFile` instance itself.
+    allow_pickle : bool, optional
+        Allow loading pickled data. Default: True
+    pickle_kwargs : dict, optional
+        Additional keyword arguments to pass on to pickle.load.
+        These are only useful when loading object arrays saved on
+        Python 2 when using Python 3.
 
     Parameters
     ----------
@@ -195,12 +201,15 @@ class NpzFile(object):
 
     """
 
-    def __init__(self, fid, own_fid=False):
+    def __init__(self, fid, own_fid=False, allow_pickle=True,
+                 pickle_kwargs=None):
         # Import is postponed to here since zipfile depends on gzip, an
         # optional component of the so-called standard library.
         _zip = zipfile_factory(fid)
         self._files = _zip.namelist()
         self.files = []
+        self.allow_pickle = allow_pickle
+        self.pickle_kwargs = pickle_kwargs
         for x in self._files:
             if x.endswith('.npy'):
                 self.files.append(x[:-4])
@@ -256,7 +265,9 @@ class NpzFile(object):
             bytes.close()
             if magic == format.MAGIC_PREFIX:
                 bytes = self.zip.open(key)
-                return format.read_array(bytes)
+                return format.read_array(bytes,
+                                         allow_pickle=self.allow_pickle,
+                                         pickle_kwargs=self.pickle_kwargs)
             else:
                 return self.zip.read(key)
         else:
@@ -289,7 +300,8 @@ class NpzFile(object):
         return self.files.__contains__(key)
 
 
-def load(file, mmap_mode=None):
+def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
+         encoding='ASCII'):
     """
     Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.
 
@@ -306,6 +318,23 @@ def load(file, mmap_mode=None):
         and sliced like any ndarray.  Memory mapping is especially useful
         for accessing small fragments of large files without reading the
         entire file into memory.
+    allow_pickle : bool, optional
+        Allow loading pickled object arrays stored in npy files. Reasons for
+        disallowing pickles include security, as loading pickled data can
+        execute arbitrary code. If pickles are disallowed, loading object
+        arrays will fail.
+        Default: True
+    fix_imports : bool, optional
+        Only useful when loading Python 2 generated pickled files on Python 3,
+        which includes npy/npz files containing object arrays. If `fix_imports`
+        is True, pickle will try to map the old Python 2 names to the new names
+        used in Python 3.
+    encoding : str, optional
+        What encoding to use when reading Python 2 strings. Only useful when
+        loading Python 2 generated pickled files on Python 3, which includes
+        npy/npz files containing object arrays. Values other than 'latin1',
+        'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
+        data. Default: 'ASCII'
 
     Returns
     -------
@@ -317,6 +346,8 @@ def load(file, mmap_mode=None):
     ------
     IOError
         If the input file does not exist or cannot be read.
+    ValueError
+        The file contains an object array, but allow_pickle=False given.
 
     See Also
     --------
@@ -381,6 +412,26 @@ def load(file, mmap_mode=None):
     else:
         fid = file
 
+    if encoding not in ('ASCII', 'latin1', 'bytes'):
+        # The 'encoding' value for pickle also affects what encoding
+        # the serialized binary data of Numpy arrays is loaded
+        # in. Pickle does not pass on the encoding information to
+        # Numpy. The unpickling code in numpy.core.multiarray is
+        # written to assume that unicode data appearing where binary
+        # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'.
+        #
+        # Other encoding values can corrupt binary data, and we
+        # purposefully disallow them. For the same reason, the errors=
+        # argument is not exposed, as values other than 'strict'
+        # result can similarly silently corrupt numerical data.
+        raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'")
+
+    if sys.version_info[0] >= 3:
+        pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports)
+    else:
+        # Nothing to do on Python 2
+        pickle_kwargs = {}
+
     try:
         # Code to distinguish from NumPy binary files and pickles.
         _ZIP_PREFIX = asbytes('PK\x03\x04')
@@ -392,17 +443,22 @@ def load(file, mmap_mode=None):
             # Transfer file ownership to NpzFile
             tmp = own_fid
             own_fid = False
-            return NpzFile(fid, own_fid=tmp)
+            return NpzFile(fid, own_fid=tmp, allow_pickle=allow_pickle,
+                           pickle_kwargs=pickle_kwargs)
         elif magic == format.MAGIC_PREFIX:
             # .npy file
             if mmap_mode:
                 return format.open_memmap(file, mode=mmap_mode)
             else:
-                return format.read_array(fid)
+                return format.read_array(fid, allow_pickle=allow_pickle,
+                                         pickle_kwargs=pickle_kwargs)
         else:
             # Try a pickle
+            if not allow_pickle:
+                raise ValueError("allow_pickle=False, but file does not contain "
+                                 "non-pickled data")
             try:
-                return pickle.load(fid)
+                return pickle.load(fid, **pickle_kwargs)
             except:
                 raise IOError(
                     "Failed to interpret file %s as a pickle" % repr(file))
@@ -411,7 +467,7 @@ def load(file, mmap_mode=None):
             fid.close()
 
 
-def save(file, arr):
+def save(file, arr, allow_pickle=True, fix_imports=True):
     """
     Save an array to a binary file in NumPy ``.npy`` format.
 
@@ -422,6 +478,19 @@ def save(file, arr):
         then the filename is unchanged.  If file is a string, a ``.npy``
         extension will be appended to the file name if it does not already
         have one.
+    allow_pickle : bool, optional
+        Allow saving object arrays using Python pickles. Reasons for disallowing
+        pickles include security (loading pickled data can execute arbitrary
+        code) and portability (pickled objects may not be loadable on different
+        Python installations, for example if the stored objects require libraries
+        that are not available, and not all pickled data is compatible between
+        Python 2 and Python 3).
+        Default: True
+    fix_imports : bool, optional
+        Only useful in forcing objects in object arrays on Python 3 to be
+        pickled in a Python 2 compatible way. If `fix_imports` is True, pickle
+        will try to map the new Python 3 names to the old module names used in
+        Python 2, so that the pickle data stream is readable with Python 2.
     arr : array_like
         Array data to be saved.
 
@@ -458,9 +527,16 @@ def save(file, arr):
     else:
         fid = file
 
+    if sys.version_info[0] >= 3:
+        pickle_kwargs = dict(fix_imports=fix_imports)
+    else:
+        # Nothing to do on Python 2
+        pickle_kwargs = None
+
     try:
         arr = np.asanyarray(arr)
-        format.write_array(fid, arr)
+        format.write_array(fid, arr, allow_pickle=allow_pickle,
+                           pickle_kwargs=pickle_kwargs)
     finally:
         if own_fid:
             fid.close()
@@ -572,7 +648,7 @@ def savez_compressed(file, *args, **kwds):
     _savez(file, args, kwds, True)
 
 
-def _savez(file, args, kwds, compress):
+def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
     # Import is postponed to here since zipfile depends on gzip, an optional
     # component of the so-called standard library.
     import zipfile
@@ -606,7 +682,9 @@ def _savez(file, args, kwds, compress):
             fname = key + '.npy'
             fid = open(tmpfile, 'wb')
             try:
-                format.write_array(fid, np.asanyarray(val))
+                format.write_array(fid, np.asanyarray(val),
+                                   allow_pickle=allow_pickle,
+                                   pickle_kwargs=pickle_kwargs)
                 fid.close()
                 fid = None
                 zipf.write(tmpfile, arcname=fname)
@@ -640,7 +718,7 @@ def _getconv(dtype):
     elif issubclass(typ, np.floating):
         return floatconv
     elif issubclass(typ, np.complex):
-        return complex
+        return lambda x: complex(asstr(x))
     elif issubclass(typ, np.bytes_):
         return bytes
     else:
@@ -667,8 +745,9 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
         each row will be interpreted as an element of the array.  In this
         case, the number of columns used must match the number of fields in
         the data-type.
-    comments : str, optional
-        The character used to indicate the start of a comment;
+    comments : str or sequence, optional
+        The characters or list of characters used to indicate the start of a
+        comment;
         default: '#'.
     delimiter : str, optional
         The string used to separate values.  By default, this is any
@@ -741,7 +820,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     """
     # Type conversions for Py3 convenience
     if comments is not None:
-        comments = asbytes(comments)
+        if isinstance(comments, (basestring, bytes)):
+            comments = [asbytes(comments)]
+        else:
+            comments = [asbytes(comment) for comment in comments]
+
+        # Compile regex for comments beforehand
+        comments = (re.escape(comment) for comment in comments)
+        regex_comments = re.compile(asbytes('|').join(comments))
     user_converters = converters
     if delimiter is not None:
         delimiter = asbytes(delimiter)
@@ -813,11 +899,16 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             return tuple(ret)
 
     def split_line(line):
-        """Chop off comments, strip, and split at delimiter."""
-        if comments is None:
-            line = asbytes(line).strip(asbytes('\r\n'))
-        else:
-            line = asbytes(line).split(comments)[0].strip(asbytes('\r\n'))
+        """Chop off comments, strip, and split at delimiter.
+
+        Note that although the file is opened as text, this function
+        returns bytes.
+
+        """
+        line = asbytes(line)
+        if comments is not None:
+            line = regex_comments.split(asbytes(line), maxsplit=1)[0]
+        line = line.strip(asbytes('\r\n'))
         if line:
             return line.split(delimiter)
         else:
@@ -1240,8 +1331,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
         The string used to separate values.  By default, any consecutive
         whitespaces act as delimiter.  An integer or sequence of integers
         can also be provided as width(s) of each field.
-    skip_rows : int, optional
-        `skip_rows` was deprecated in numpy 1.5, and will be removed in
+    skiprows : int, optional
+        `skiprows` was deprecated in numpy 1.5, and will be removed in
         numpy 2.0. Please use `skip_header` instead.
     skip_header : int, optional
         The number of lines to skip at the beginning of the file.