diff options
Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r-- | numpy/lib/npyio.py | 133 |
1 files changed, 112 insertions, 21 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 0632ba1f8..ec89397a0 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -164,6 +164,12 @@ class NpzFile(object): f : BagObj instance An object on which attribute can be performed as an alternative to getitem access on the `NpzFile` instance itself. + allow_pickle : bool, optional + Allow loading pickled data. Default: True + pickle_kwargs : dict, optional + Additional keyword arguments to pass on to pickle.load. + These are only useful when loading object arrays saved on + Python 2 when using Python 3. Parameters ---------- @@ -195,12 +201,15 @@ class NpzFile(object): """ - def __init__(self, fid, own_fid=False): + def __init__(self, fid, own_fid=False, allow_pickle=True, + pickle_kwargs=None): # Import is postponed to here since zipfile depends on gzip, an # optional component of the so-called standard library. _zip = zipfile_factory(fid) self._files = _zip.namelist() self.files = [] + self.allow_pickle = allow_pickle + self.pickle_kwargs = pickle_kwargs for x in self._files: if x.endswith('.npy'): self.files.append(x[:-4]) @@ -256,7 +265,9 @@ class NpzFile(object): bytes.close() if magic == format.MAGIC_PREFIX: bytes = self.zip.open(key) - return format.read_array(bytes) + return format.read_array(bytes, + allow_pickle=self.allow_pickle, + pickle_kwargs=self.pickle_kwargs) else: return self.zip.read(key) else: @@ -289,7 +300,8 @@ class NpzFile(object): return self.files.__contains__(key) -def load(file, mmap_mode=None): +def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, + encoding='ASCII'): """ Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files. @@ -306,6 +318,23 @@ def load(file, mmap_mode=None): and sliced like any ndarray. Memory mapping is especially useful for accessing small fragments of large files without reading the entire file into memory. + allow_pickle : bool, optional + Allow loading pickled object arrays stored in npy files. Reasons for + disallowing pickles include security, as loading pickled data can + execute arbitrary code. If pickles are disallowed, loading object + arrays will fail. + Default: True + fix_imports : bool, optional + Only useful when loading Python 2 generated pickled files on Python 3, + which includes npy/npz files containing object arrays. If `fix_imports` + is True, pickle will try to map the old Python 2 names to the new names + used in Python 3. + encoding : str, optional + What encoding to use when reading Python 2 strings. Only useful when + loading Python 2 generated pickled files on Python 3, which includes + npy/npz files containing object arrays. Values other than 'latin1', + 'ASCII', and 'bytes' are not allowed, as they can corrupt numerical + data. Default: 'ASCII' Returns ------- @@ -317,6 +346,8 @@ def load(file, mmap_mode=None): ------ IOError If the input file does not exist or cannot be read. + ValueError + The file contains an object array, but allow_pickle=False given. See Also -------- @@ -381,6 +412,26 @@ def load(file, mmap_mode=None): else: fid = file + if encoding not in ('ASCII', 'latin1', 'bytes'): + # The 'encoding' value for pickle also affects what encoding + # the serialized binary data of Numpy arrays is loaded + # in. Pickle does not pass on the encoding information to + # Numpy. The unpickling code in numpy.core.multiarray is + # written to assume that unicode data appearing where binary + # should be is in 'latin1'. 'bytes' is also safe, as is 'ASCII'. + # + # Other encoding values can corrupt binary data, and we + # purposefully disallow them. For the same reason, the errors= + # argument is not exposed, as values other than 'strict' + # result can similarly silently corrupt numerical data. + raise ValueError("encoding must be 'ASCII', 'latin1', or 'bytes'") + + if sys.version_info[0] >= 3: + pickle_kwargs = dict(encoding=encoding, fix_imports=fix_imports) + else: + # Nothing to do on Python 2 + pickle_kwargs = {} + try: # Code to distinguish from NumPy binary files and pickles. _ZIP_PREFIX = asbytes('PK\x03\x04') @@ -392,17 +443,22 @@ def load(file, mmap_mode=None): # Transfer file ownership to NpzFile tmp = own_fid own_fid = False - return NpzFile(fid, own_fid=tmp) + return NpzFile(fid, own_fid=tmp, allow_pickle=allow_pickle, + pickle_kwargs=pickle_kwargs) elif magic == format.MAGIC_PREFIX: # .npy file if mmap_mode: return format.open_memmap(file, mode=mmap_mode) else: - return format.read_array(fid) + return format.read_array(fid, allow_pickle=allow_pickle, + pickle_kwargs=pickle_kwargs) else: # Try a pickle + if not allow_pickle: + raise ValueError("allow_pickle=False, but file does not contain " + "non-pickled data") try: - return pickle.load(fid) + return pickle.load(fid, **pickle_kwargs) except: raise IOError( "Failed to interpret file %s as a pickle" % repr(file)) @@ -411,7 +467,7 @@ def load(file, mmap_mode=None): fid.close() -def save(file, arr): +def save(file, arr, allow_pickle=True, fix_imports=True): """ Save an array to a binary file in NumPy ``.npy`` format. @@ -422,6 +478,19 @@ def save(file, arr): then the filename is unchanged. If file is a string, a ``.npy`` extension will be appended to the file name if it does not already have one. + allow_pickle : bool, optional + Allow saving object arrays using Python pickles. Reasons for disallowing + pickles include security (loading pickled data can execute arbitrary + code) and portability (pickled objects may not be loadable on different + Python installations, for example if the stored objects require libraries + that are not available, and not all pickled data is compatible between + Python 2 and Python 3). + Default: True + fix_imports : bool, optional + Only useful in forcing objects in object arrays on Python 3 to be + pickled in a Python 2 compatible way. If `fix_imports` is True, pickle + will try to map the new Python 3 names to the old module names used in + Python 2, so that the pickle data stream is readable with Python 2. arr : array_like Array data to be saved. @@ -458,9 +527,16 @@ def save(file, arr): else: fid = file + if sys.version_info[0] >= 3: + pickle_kwargs = dict(fix_imports=fix_imports) + else: + # Nothing to do on Python 2 + pickle_kwargs = None + try: arr = np.asanyarray(arr) - format.write_array(fid, arr) + format.write_array(fid, arr, allow_pickle=allow_pickle, + pickle_kwargs=pickle_kwargs) finally: if own_fid: fid.close() @@ -572,7 +648,7 @@ def savez_compressed(file, *args, **kwds): _savez(file, args, kwds, True) -def _savez(file, args, kwds, compress): +def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): # Import is postponed to here since zipfile depends on gzip, an optional # component of the so-called standard library. import zipfile @@ -606,7 +682,9 @@ def _savez(file, args, kwds, compress): fname = key + '.npy' fid = open(tmpfile, 'wb') try: - format.write_array(fid, np.asanyarray(val)) + format.write_array(fid, np.asanyarray(val), + allow_pickle=allow_pickle, + pickle_kwargs=pickle_kwargs) fid.close() fid = None zipf.write(tmpfile, arcname=fname) @@ -640,7 +718,7 @@ def _getconv(dtype): elif issubclass(typ, np.floating): return floatconv elif issubclass(typ, np.complex): - return complex + return lambda x: complex(asstr(x)) elif issubclass(typ, np.bytes_): return bytes else: @@ -667,8 +745,9 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, each row will be interpreted as an element of the array. In this case, the number of columns used must match the number of fields in the data-type. - comments : str, optional - The character used to indicate the start of a comment; + comments : str or sequence, optional + The characters or list of characters used to indicate the start of a + comment; default: '#'. delimiter : str, optional The string used to separate values. By default, this is any @@ -741,7 +820,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, """ # Type conversions for Py3 convenience if comments is not None: - comments = asbytes(comments) + if isinstance(comments, (basestring, bytes)): + comments = [asbytes(comments)] + else: + comments = [asbytes(comment) for comment in comments] + + # Compile regex for comments beforehand + comments = (re.escape(comment) for comment in comments) + regex_comments = re.compile(asbytes('|').join(comments)) user_converters = converters if delimiter is not None: delimiter = asbytes(delimiter) @@ -813,11 +899,16 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, return tuple(ret) def split_line(line): - """Chop off comments, strip, and split at delimiter.""" - if comments is None: - line = asbytes(line).strip(asbytes('\r\n')) - else: - line = asbytes(line).split(comments)[0].strip(asbytes('\r\n')) + """Chop off comments, strip, and split at delimiter. + + Note that although the file is opened as text, this function + returns bytes. + + """ + line = asbytes(line) + if comments is not None: + line = regex_comments.split(asbytes(line), maxsplit=1)[0] + line = line.strip(asbytes('\r\n')) if line: return line.split(delimiter) else: @@ -1240,8 +1331,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, The string used to separate values. By default, any consecutive whitespaces act as delimiter. An integer or sequence of integers can also be provided as width(s) of each field. - skip_rows : int, optional - `skip_rows` was deprecated in numpy 1.5, and will be removed in + skiprows : int, optional + `skiprows` was deprecated in numpy 1.5, and will be removed in numpy 2.0. Please use `skip_header` instead. skip_header : int, optional The number of lines to skip at the beginning of the file. |