diff options
Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r-- | numpy/lib/npyio.py | 1308 |
1 files changed, 771 insertions, 537 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 0dee6b333..d6d2a0c6c 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -3,32 +3,46 @@ from __future__ import division, absolute_import, print_function import sys import os import re +import functools import itertools import warnings import weakref +import contextlib from operator import itemgetter, index as opindex import numpy as np from . import format from ._datasource import DataSource +from numpy.core import overrides from numpy.core.multiarray import packbits, unpackbits +from numpy.core.overrides import set_module +from numpy.core._internal import recursive from ._iotools import ( LineSplitter, NameValidator, StringConverter, ConverterError, ConverterLockError, ConversionWarning, _is_string_like, - has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name + has_nested_fields, flatten_dtype, easy_dtype, _decode_line ) from numpy.compat import ( - asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path + asbytes, asstr, asunicode, bytes, basestring, os_fspath, os_PathLike, + pickle, contextlib_nullcontext ) if sys.version_info[0] >= 3: - import pickle + from collections.abc import Mapping else: - import cPickle as pickle from future_builtins import map + from collections import Mapping + + +@set_module('numpy') +def loads(*args, **kwargs): + # NumPy 1.15.0, 2017-12-10 + warnings.warn( + "np.loads is deprecated, use pickle.loads instead", + DeprecationWarning, stacklevel=2) + return pickle.loads(*args, **kwargs) -loads = pickle.loads __all__ = [ 'savetxt', 'loadtxt', 'genfromtxt', 'ndfromtxt', 'mafromtxt', @@ -37,6 +51,10 @@ __all__ = [ ] +array_function_dispatch = functools.partial( + overrides.array_function_dispatch, module='numpy') + + class BagObj(object): """ BagObj(obj) @@ -83,7 +101,7 @@ class BagObj(object): This also enables tab-completion in an interpreter or IPython. 
""" - return object.__getattribute__(self, '_obj').keys() + return list(object.__getattribute__(self, '_obj').keys()) def zipfile_factory(file, *args, **kwargs): @@ -94,14 +112,14 @@ def zipfile_factory(file, *args, **kwargs): pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile constructor. """ - if is_pathlib_path(file): - file = str(file) + if not hasattr(file, 'read'): + file = os_fspath(file) import zipfile kwargs['allowZip64'] = True return zipfile.ZipFile(file, *args, **kwargs) -class NpzFile(object): +class NpzFile(Mapping): """ NpzFile(fid) @@ -150,13 +168,13 @@ class NpzFile(object): >>> x = np.arange(10) >>> y = np.sin(x) >>> np.savez(outfile, x=x, y=y) - >>> outfile.seek(0) + >>> _ = outfile.seek(0) >>> npz = np.load(outfile) >>> isinstance(npz, np.lib.io.NpzFile) True - >>> npz.files - ['y', 'x'] + >>> sorted(npz.files) + ['x', 'y'] >>> npz['x'] # getitem access array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) >>> npz.f.x # attribute lookup @@ -207,6 +225,13 @@ class NpzFile(object): def __del__(self): self.close() + # Implement the Mapping ABC + def __iter__(self): + return iter(self.files) + + def __len__(self): + return len(self.files) + def __getitem__(self, key): # FIXME: This seems like it will copy strings around # more than is strictly necessary. The zipfile @@ -216,11 +241,11 @@ class NpzFile(object): # It would be better if the zipfile could read # (or at least uncompress) the data # directly into the array memory. - member = 0 + member = False if key in self._files: - member = 1 + member = True elif key in self.files: - member = 1 + member = True key += '.npy' if member: bytes = self.zip.open(key) @@ -236,38 +261,41 @@ class NpzFile(object): else: raise KeyError("%s is not a file in the archive" % key) - def __iter__(self): - return iter(self.files) - - def items(self): - """ - Return a list of tuples, with each tuple (filename, array in file). 
- """ - return [(f, self[f]) for f in self.files] + if sys.version_info.major == 3: + # deprecate the python 2 dict apis that we supported by accident in + # python 3. We forgot to implement itervalues() at all in earlier + # versions of numpy, so no need to deprecated it here. - def iteritems(self): - """Generator that returns tuples (filename, array in file).""" - for f in self.files: - yield (f, self[f]) + def iteritems(self): + # Numpy 1.15, 2018-02-20 + warnings.warn( + "NpzFile.iteritems is deprecated in python 3, to match the " + "removal of dict.itertems. Use .items() instead.", + DeprecationWarning, stacklevel=2) + return self.items() - def keys(self): - """Return files in the archive with a ``.npy`` extension.""" - return self.files - - def iterkeys(self): - """Return an iterator over the files in the archive.""" - return self.__iter__() - - def __contains__(self, key): - return self.files.__contains__(key) + def iterkeys(self): + # Numpy 1.15, 2018-02-20 + warnings.warn( + "NpzFile.iterkeys is deprecated in python 3, to match the " + "removal of dict.iterkeys. Use .keys() instead.", + DeprecationWarning, stacklevel=2) + return self.keys() +@set_module('numpy') def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, encoding='ASCII'): """ Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files. + .. warning:: Loading files that contain object arrays uses the ``pickle`` + module, which is not secure against erroneous or maliciously + constructed data. Consider passing ``allow_pickle=False`` to + load data that is known not to contain object arrays for the + safer handling of untrusted sources. + Parameters ---------- file : file-like object, string, or pathlib.Path @@ -294,7 +322,7 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, used in Python 3. encoding : str, optional What encoding to use when reading Python 2 strings. 
Only useful when - loading Python 2 generated pickled files on Python 3, which includes + loading Python 2 generated pickled files in Python 3, which includes npy/npz files containing object arrays. Values other than 'latin1', 'ASCII', and 'bytes' are not allowed, as they can corrupt numerical data. Default: 'ASCII' @@ -365,16 +393,6 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, memmap([4, 5, 6]) """ - own_fid = False - if isinstance(file, basestring): - fid = open(file, "rb") - own_fid = True - elif is_pathlib_path(file): - fid = file.open("rb") - own_fid = True - else: - fid = file - if encoding not in ('ASCII', 'latin1', 'bytes'): # The 'encoding' value for pickle also affects what encoding # the serialized binary data of NumPy arrays is loaded @@ -395,21 +413,30 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, # Nothing to do on Python 2 pickle_kwargs = {} + # TODO: Use contextlib.ExitStack once we drop Python 2 + if hasattr(file, 'read'): + fid = file + own_fid = False + else: + fid = open(os_fspath(file), "rb") + own_fid = True + try: # Code to distinguish from NumPy binary files and pickles. 
- _ZIP_PREFIX = asbytes('PK\x03\x04') + _ZIP_PREFIX = b'PK\x03\x04' + _ZIP_SUFFIX = b'PK\x05\x06' # empty zip files start with this N = len(format.MAGIC_PREFIX) magic = fid.read(N) # If the file size is less than N, we need to make sure not # to seek past the beginning of the file fid.seek(-min(N, len(magic)), 1) # back-up - if magic.startswith(_ZIP_PREFIX): + if magic.startswith(_ZIP_PREFIX) or magic.startswith(_ZIP_SUFFIX): # zip-file (assume .npz) # Transfer file ownership to NpzFile - tmp = own_fid + ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle, + pickle_kwargs=pickle_kwargs) own_fid = False - return NpzFile(fid, own_fid=tmp, allow_pickle=allow_pickle, - pickle_kwargs=pickle_kwargs) + return ret elif magic == format.MAGIC_PREFIX: # .npy file if mmap_mode: @@ -420,11 +447,11 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, else: # Try a pickle if not allow_pickle: - raise ValueError("allow_pickle=False, but file does not contain " - "non-pickled data") + raise ValueError("Cannot load file containing pickled data " + "when allow_pickle=False") try: return pickle.load(fid, **pickle_kwargs) - except: + except Exception: raise IOError( "Failed to interpret file %s as a pickle" % repr(file)) finally: @@ -432,6 +459,11 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True, fid.close() +def _save_dispatcher(file, arr, allow_pickle=None, fix_imports=None): + return (arr,) + + +@array_function_dispatch(_save_dispatcher) def save(file, arr, allow_pickle=True, fix_imports=True): """ Save an array to a binary file in NumPy ``.npy`` format. @@ -443,6 +475,8 @@ def save(file, arr, allow_pickle=True, fix_imports=True): then the filename is unchanged. If file is a string or Path, a ``.npy`` extension will be appended to the file name if it does not already have one. + arr : array_like + Array data to be saved. allow_pickle : bool, optional Allow saving object arrays using Python pickles. 
Reasons for disallowing pickles include security (loading pickled data can execute arbitrary @@ -456,8 +490,6 @@ def save(file, arr, allow_pickle=True, fix_imports=True): pickled in a Python 2 compatible way. If `fix_imports` is True, pickle will try to map the new Python 3 names to the old module names used in Python 2, so that the pickle data stream is readable with Python 2. - arr : array_like - Array data to be saved. See Also -------- @@ -466,9 +498,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True): Notes ----- - For a description of the ``.npy`` format, see the module docstring - of `numpy.lib.format` or the NumPy Enhancement Proposal - http://docs.scipy.org/doc/numpy/neps/npy-format.html + For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`. Examples -------- @@ -478,24 +508,20 @@ def save(file, arr, allow_pickle=True, fix_imports=True): >>> x = np.arange(10) >>> np.save(outfile, x) - >>> outfile.seek(0) # Only needed here to simulate closing & reopening file + >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file >>> np.load(outfile) array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) """ own_fid = False - if isinstance(file, basestring): + if hasattr(file, 'read'): + fid = file + else: + file = os_fspath(file) if not file.endswith('.npy'): file = file + '.npy' fid = open(file, "wb") own_fid = True - elif is_pathlib_path(file): - if not file.name.endswith('.npy'): - file = file.parent / (file.name + '.npy') - fid = file.open("wb") - own_fid = True - else: - fid = file if sys.version_info[0] >= 3: pickle_kwargs = dict(fix_imports=fix_imports) @@ -512,6 +538,14 @@ def save(file, arr, allow_pickle=True, fix_imports=True): fid.close() +def _savez_dispatcher(file, *args, **kwds): + for a in args: + yield a + for v in kwds.values(): + yield v + + +@array_function_dispatch(_savez_dispatcher) def savez(file, *args, **kwds): """ Save several arrays into a single file in uncompressed ``.npz`` format. 
@@ -552,9 +586,7 @@ def savez(file, *args, **kwds): The ``.npz`` file format is a zipped archive of files named after the variables they contain. The archive is not compressed and each file in the archive contains one variable in ``.npy`` format. For a - description of the ``.npy`` format, see `numpy.lib.format` or the - NumPy Enhancement Proposal - http://docs.scipy.org/doc/numpy/neps/npy-format.html + description of the ``.npy`` format, see :py:mod:`numpy.lib.format`. When opening the saved ``.npz`` file with `load` a `NpzFile` object is returned. This is a dictionary-like object which can be queried for @@ -571,10 +603,10 @@ def savez(file, *args, **kwds): Using `savez` with \\*args, the arrays are saved with default names. >>> np.savez(outfile, x, y) - >>> outfile.seek(0) # Only needed here to simulate closing & reopening file + >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file >>> npzfile = np.load(outfile) >>> npzfile.files - ['arr_1', 'arr_0'] + ['arr_0', 'arr_1'] >>> npzfile['arr_0'] array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) @@ -582,10 +614,10 @@ def savez(file, *args, **kwds): >>> outfile = TemporaryFile() >>> np.savez(outfile, x=x, y=y) - >>> outfile.seek(0) + >>> _ = outfile.seek(0) >>> npzfile = np.load(outfile) - >>> npzfile.files - ['y', 'x'] + >>> sorted(npzfile.files) + ['x', 'y'] >>> npzfile['x'] array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) @@ -593,6 +625,14 @@ def savez(file, *args, **kwds): _savez(file, args, kwds, False) +def _savez_compressed_dispatcher(file, *args, **kwds): + for a in args: + yield a + for v in kwds.values(): + yield v + + +@array_function_dispatch(_savez_compressed_dispatcher) def savez_compressed(file, *args, **kwds): """ Save several arrays into a single file in compressed ``.npz`` format. @@ -633,9 +673,9 @@ def savez_compressed(file, *args, **kwds): The ``.npz`` file format is a zipped archive of files named after the variables they contain. 
The archive is compressed with ``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable - in ``.npy`` format. For a description of the ``.npy`` format, see - `numpy.lib.format` or the NumPy Enhancement Proposal - http://docs.scipy.org/doc/numpy/neps/npy-format.html + in ``.npy`` format. For a description of the ``.npy`` format, see + :py:mod:`numpy.lib.format`. + When opening the saved ``.npz`` file with `load` a `NpzFile` object is returned. This is a dictionary-like object which can be queried for @@ -661,15 +701,11 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): # Import is postponed to here since zipfile depends on gzip, an optional # component of the so-called standard library. import zipfile - # Import deferred for startup time improvement - import tempfile - if isinstance(file, basestring): + if not hasattr(file, 'read'): + file = os_fspath(file) if not file.endswith('.npz'): file = file + '.npz' - elif is_pathlib_path(file): - if not file.name.endswith('.npz'): - file = file.parent / (file.name + '.npz') namedict = kwds for i, val in enumerate(args): @@ -686,31 +722,44 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): zipf = zipfile_factory(file, mode="w", compression=compression) - # Stage arrays in a temporary file on disk, before writing to zip. - - # Since target file might be big enough to exceed capacity of a global - # temporary directory, create temp file side-by-side with the target file. - file_dir, file_prefix = os.path.split(file) if _is_string_like(file) else (None, 'tmp') - fd, tmpfile = tempfile.mkstemp(prefix=file_prefix, dir=file_dir, suffix='-numpy.npy') - os.close(fd) - try: + if sys.version_info >= (3, 6): + # Since Python 3.6 it is possible to write directly to a ZIP file. 
for key, val in namedict.items(): fname = key + '.npy' - fid = open(tmpfile, 'wb') - try: - format.write_array(fid, np.asanyarray(val), + val = np.asanyarray(val) + force_zip64 = val.nbytes >= 2**30 + with zipf.open(fname, 'w', force_zip64=force_zip64) as fid: + format.write_array(fid, val, allow_pickle=allow_pickle, pickle_kwargs=pickle_kwargs) - fid.close() - fid = None - zipf.write(tmpfile, arcname=fname) - except IOError as exc: - raise IOError("Failed to write to %s: %s" % (tmpfile, exc)) - finally: - if fid: + else: + # Stage arrays in a temporary file on disk, before writing to zip. + + # Import deferred for startup time improvement + import tempfile + # Since target file might be big enough to exceed capacity of a global + # temporary directory, create temp file side-by-side with the target file. + file_dir, file_prefix = os.path.split(file) if _is_string_like(file) else (None, 'tmp') + fd, tmpfile = tempfile.mkstemp(prefix=file_prefix, dir=file_dir, suffix='-numpy.npy') + os.close(fd) + try: + for key, val in namedict.items(): + fname = key + '.npy' + fid = open(tmpfile, 'wb') + try: + format.write_array(fid, np.asanyarray(val), + allow_pickle=allow_pickle, + pickle_kwargs=pickle_kwargs) fid.close() - finally: - os.remove(tmpfile) + fid = None + zipf.write(tmpfile, arcname=fname) + except IOError as exc: + raise IOError("Failed to write to %s: %s" % (tmpfile, exc)) + finally: + if fid: + fid.close() + finally: + os.remove(tmpfile) zipf.close() @@ -720,8 +769,8 @@ def _getconv(dtype): def floatconv(x): x.lower() - if b'0x' in x: - return float.fromhex(asstr(x)) + if '0x' in x: + return float.fromhex(x) return float(x) typ = dtype.type @@ -737,17 +786,23 @@ def _getconv(dtype): return np.longdouble elif issubclass(typ, np.floating): return floatconv - elif issubclass(typ, np.complex): - return lambda x: complex(asstr(x)) + elif issubclass(typ, complex): + return lambda x: complex(asstr(x).replace('+-', '-')) elif issubclass(typ, np.bytes_): return asbytes + 
elif issubclass(typ, np.unicode_): + return asunicode else: return asstr +# amount of lines loadtxt reads in one chunk, can be overridden for testing +_loadtxt_chunksize = 50000 + +@set_module('numpy') def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, - ndmin=0): + ndmin=0, encoding='bytes', max_rows=None): """ Load data from a text file. @@ -765,33 +820,31 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, each row will be interpreted as an element of the array. In this case, the number of columns used must match the number of fields in the data-type. - comments : str or sequence, optional + comments : str or sequence of str, optional The characters or list of characters used to indicate the start of a - comment; - default: '#'. + comment. None implies no comments. For backwards compatibility, byte + strings will be decoded as 'latin1'. The default is '#'. delimiter : str, optional - The string used to separate values. By default, this is any - whitespace. + The string used to separate values. For backwards compatibility, byte + strings will be decoded as 'latin1'. The default is whitespace. converters : dict, optional - A dictionary mapping column number to a function that will convert - that column to a float. E.g., if column 0 is a date string: - ``converters = {0: datestr2num}``. Converters can also be used to - provide a default value for missing data (but see also `genfromtxt`): - ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None. + A dictionary mapping column number to a function that will parse the + column string into the desired value. E.g., if column 0 is a date + string: ``converters = {0: datestr2num}``. Converters can also be + used to provide a default value for missing data (but see also + `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``. + Default: None. skiprows : int, optional - Skip the first `skiprows` lines; default: 0. 
- + Skip the first `skiprows` lines, including comments; default: 0. usecols : int or sequence, optional Which columns to read, with 0 being the first. For example, - usecols = (1,4,5) will extract the 2nd, 5th and 6th columns. + ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns. The default, None, results in all columns being read. - .. versionadded:: 1.11.0 - - Also when a single column has to be read it is possible to use - an integer instead of a tuple. E.g ``usecols = 3`` reads the - fourth column the same way as `usecols = (3,)`` would. - + .. versionchanged:: 1.11.0 + When a single column has to be read it is possible to use + an integer instead of a tuple. E.g ``usecols = 3`` reads the + fourth column the same way as ``usecols = (3,)`` would. unpack : bool, optional If True, the returned array is transposed, so that arguments may be unpacked using ``x, y, z = loadtxt(...)``. When used with a structured @@ -802,6 +855,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Legal values: 0 (default), 1 or 2. .. versionadded:: 1.6.0 + encoding : str, optional + Encoding used to decode the inputfile. Does not apply to input streams. + The special value 'bytes' enables backward compatibility workarounds + that ensures you receive byte arrays as results if possible and passes + 'latin1' encoded strings to converters. Override this value to receive + unicode arrays and pass strings as input to converters. If set to None + the system default is used. The default value is 'bytes'. + + .. versionadded:: 1.14.0 + max_rows : int, optional + Read `max_rows` lines of content after `skiprows` lines. The default + is to read all the lines. + + .. 
versionadded:: 1.16.0 Returns ------- @@ -828,38 +895,44 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Examples -------- >>> from io import StringIO # StringIO behaves like a file object - >>> c = StringIO("0 1\\n2 3") + >>> c = StringIO(u"0 1\\n2 3") >>> np.loadtxt(c) - array([[ 0., 1.], - [ 2., 3.]]) + array([[0., 1.], + [2., 3.]]) - >>> d = StringIO("M 21 72\\nF 35 58") + >>> d = StringIO(u"M 21 72\\nF 35 58") >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'), ... 'formats': ('S1', 'i4', 'f4')}) - array([('M', 21, 72.0), ('F', 35, 58.0)], - dtype=[('gender', '|S1'), ('age', '<i4'), ('weight', '<f4')]) + array([(b'M', 21, 72.), (b'F', 35, 58.)], + dtype=[('gender', 'S1'), ('age', '<i4'), ('weight', '<f4')]) - >>> c = StringIO("1,0,2\\n3,0,4") + >>> c = StringIO(u"1,0,2\\n3,0,4") >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True) >>> x - array([ 1., 3.]) + array([1., 3.]) >>> y - array([ 2., 4.]) + array([2., 4.]) """ # Type conversions for Py3 convenience if comments is not None: if isinstance(comments, (basestring, bytes)): - comments = [asbytes(comments)] - else: - comments = [asbytes(comment) for comment in comments] - + comments = [comments] + comments = [_decode_line(x) for x in comments] # Compile regex for comments beforehand comments = (re.escape(comment) for comment in comments) - regex_comments = re.compile(asbytes('|').join(comments)) - user_converters = converters + regex_comments = re.compile('|'.join(comments)) + if delimiter is not None: - delimiter = asbytes(delimiter) + delimiter = _decode_line(delimiter) + + user_converters = converters + + if encoding == 'bytes': + encoding = None + byte_converters = True + else: + byte_converters = False if usecols is not None: # Allow usecols to be a single int or a sequence of ints @@ -882,27 +955,31 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, fown = False try: - if is_pathlib_path(fname): - fname = str(fname) + if isinstance(fname, 
os_PathLike): + fname = os_fspath(fname) if _is_string_like(fname): + fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) + fencoding = getattr(fh, 'encoding', 'latin1') + fh = iter(fh) fown = True - if fname.endswith('.gz'): - import gzip - fh = iter(gzip.GzipFile(fname)) - elif fname.endswith('.bz2'): - import bz2 - fh = iter(bz2.BZ2File(fname)) - elif sys.version_info[0] == 2: - fh = iter(open(fname, 'U')) - else: - fh = iter(open(fname)) else: fh = iter(fname) + fencoding = getattr(fname, 'encoding', 'latin1') except TypeError: raise ValueError('fname must be a string, file handle, or generator') - X = [] - def flatten_dtype(dt): + # input may be a python2 io stream + if encoding is not None: + fencoding = encoding + # we must assume local encoding + # TODO emit portability warning? + elif fencoding is None: + import locale + fencoding = locale.getpreferredencoding() + + # not to be confused with the flatten_dtype we import... + @recursive + def flatten_dtype_internal(self, dt): """Unpack a structured data-type, and produce re-packing info.""" if dt.names is None: # If the dtype is flattened, return. 
@@ -922,7 +999,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, packing = [] for field in dt.names: tp, bytes = dt.fields[field] - flat_dt, flat_packing = flatten_dtype(tp) + flat_dt, flat_packing = self(tp) types.extend(flat_dt) # Avoid extra nesting for subarrays if tp.ndim > 0: @@ -931,7 +1008,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, packing.append((len(flat_dt), flat_packing)) return (types, packing) - def pack_items(items, packing): + @recursive + def pack_items(self, items, packing): """Pack items into nested lists based on re-packing info.""" if packing is None: return items[0] @@ -943,26 +1021,60 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, start = 0 ret = [] for length, subpacking in packing: - ret.append(pack_items(items[start:start+length], subpacking)) + ret.append(self(items[start:start+length], subpacking)) start += length return tuple(ret) def split_line(line): - """Chop off comments, strip, and split at delimiter. - - Note that although the file is opened as text, this function - returns bytes. + """Chop off comments, strip, and split at delimiter. """ + line = _decode_line(line, encoding=encoding) - """ - line = asbytes(line) if comments is not None: - line = regex_comments.split(asbytes(line), maxsplit=1)[0] - line = line.strip(asbytes('\r\n')) + line = regex_comments.split(line, maxsplit=1)[0] + line = line.strip('\r\n') if line: return line.split(delimiter) else: return [] + def read_data(chunk_size): + """Parse each line, including the first. + + The file read, `fh`, is a global defined above. + + Parameters + ---------- + chunk_size : int + At most `chunk_size` lines are read at a time, with iteration + until all lines are read. 
+ + """ + X = [] + line_iter = itertools.chain([first_line], fh) + line_iter = itertools.islice(line_iter, max_rows) + for i, line in enumerate(line_iter): + vals = split_line(line) + if len(vals) == 0: + continue + if usecols: + vals = [vals[j] for j in usecols] + if len(vals) != N: + line_num = i + skiprows + 1 + raise ValueError("Wrong number of columns at line %d" + % line_num) + + # Convert each value according to its column and store + items = [conv(val) for (conv, val) in zip(converters, vals)] + + # Then pack it according to the dtype's nesting + items = pack_items(items, packing) + X.append(items) + if len(X) > chunk_size: + yield X + X = [] + if X: + yield X + try: # Make sure we're dealing with a proper dtype dtype = np.dtype(dtype) @@ -986,7 +1098,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, warnings.warn('loadtxt: Empty input file: "%s"' % fname, stacklevel=2) N = len(usecols or first_vals) - dtype_types, packing = flatten_dtype(dtype) + dtype_types, packing = flatten_dtype_internal(dtype) if len(dtype_types) > 1: # We're dealing with a structured array, each field of # the dtype matches a column @@ -1005,30 +1117,41 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, except ValueError: # Unused converter specified continue - converters[i] = conv - - # Parse each line, including the first - for i, line in enumerate(itertools.chain([first_line], fh)): - vals = split_line(line) - if len(vals) == 0: - continue - if usecols: - vals = [vals[i] for i in usecols] - if len(vals) != N: - line_num = i + skiprows + 1 - raise ValueError("Wrong number of columns at line %d" - % line_num) - - # Convert each value according to its column and store - items = [conv(val) for (conv, val) in zip(converters, vals)] - # Then pack it according to the dtype's nesting - items = pack_items(items, packing) - X.append(items) + if byte_converters: + # converters may use decode to workaround numpy's old behaviour, + # so encode the string again 
before passing to the user converter + def tobytes_first(x, conv): + if type(x) is bytes: + return conv(x) + return conv(x.encode("latin1")) + converters[i] = functools.partial(tobytes_first, conv=conv) + else: + converters[i] = conv + + converters = [conv if conv is not bytes else + lambda x: x.encode(fencoding) for conv in converters] + + # read data in chunks and fill it into an array via resize + # over-allocating and shrinking the array later may be faster but is + # probably not relevant compared to the cost of actually reading and + # converting the data + X = None + for x in read_data(_loadtxt_chunksize): + if X is None: + X = np.array(x, dtype) + else: + nshape = list(X.shape) + pos = nshape[0] + nshape[0] += len(x) + X.resize(nshape, refcheck=False) + X[pos:, ...] = x finally: if fown: fh.close() - X = np.array(X, dtype) + if X is None: + X = np.array([], dtype) + # Multicolumn data are returned with shape (1, N, M), i.e. # (1, 1, M) for a single row - remove the singleton dimension there if X.ndim == 3 and X.shape[:2] == (1, 1): @@ -1059,8 +1182,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, return X +def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None, + header=None, footer=None, comments=None, + encoding=None): + return (X,) + + +@array_function_dispatch(_savetxt_dispatcher) def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', - footer='', comments='# '): + footer='', comments='# ', encoding=None): """ Save an array to a text file. @@ -1070,20 +1200,21 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', If the filename ends in ``.gz``, the file is automatically saved in compressed gzip format. `loadtxt` understands gzipped files transparently. - X : array_like + X : 1D or 2D array_like Data to be saved to a text file. fmt : str or sequence of strs, optional A single format (%10.5f), a sequence of formats, or a multi-format string, e.g. 
'Iteration %d -- %10.5f', in which case `delimiter` is ignored. For complex `X`, the legal options for `fmt` are: - a) a single specifier, `fmt='%.4e'`, resulting in numbers formatted - like `' (%s+%sj)' % (fmt, fmt)` - b) a full string specifying every real and imaginary part, e.g. - `' %.4e %+.4ej %.4e %+.4ej %.4e %+.4ej'` for 3 columns - c) a list of specifiers, one per column - in this case, the real - and imaginary part must have separate specifiers, - e.g. `['%.3e + %.3ej', '(%.15e%+.15ej)']` for 2 columns + + * a single specifier, `fmt='%.4e'`, resulting in numbers formatted + like `' (%s+%sj)' % (fmt, fmt)` + * a full string specifying every real and imaginary part, e.g. + `' %.4e %+.4ej %.4e %+.4ej %.4e %+.4ej'` for 3 columns + * a list of specifiers, one per column - in this case, the real + and imaginary part must have separate specifiers, + e.g. `['%.3e + %.3ej', '(%.15e%+.15ej)']` for 2 columns delimiter : str, optional String or character separating columns. newline : str, optional @@ -1104,6 +1235,13 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', ``numpy.loadtxt``. .. versionadded:: 1.7.0 + encoding : {None, str}, optional + Encoding used to encode the outputfile. Does not apply to output + streams. If the encoding is something other than 'bytes' or 'latin1' + you will not be able to load the file in NumPy versions < 1.14. Default + is 'latin1'. + + .. versionadded:: 1.14.0 See Also @@ -1161,8 +1299,8 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', References ---------- .. [1] `Format Specification Mini-Language - <http://docs.python.org/library/string.html# - format-specification-mini-language>`_, Python Documentation. + <https://docs.python.org/library/string.html#format-specification-mini-language>`_, + Python Documentation. 
Examples -------- @@ -1178,21 +1316,53 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', fmt = asstr(fmt) delimiter = asstr(delimiter) + class WriteWrap(object): + """Convert to unicode in py2 or to bytes on bytestream inputs. + + """ + def __init__(self, fh, encoding): + self.fh = fh + self.encoding = encoding + self.do_write = self.first_write + + def close(self): + self.fh.close() + + def write(self, v): + self.do_write(v) + + def write_bytes(self, v): + if isinstance(v, bytes): + self.fh.write(v) + else: + self.fh.write(v.encode(self.encoding)) + + def write_normal(self, v): + self.fh.write(asunicode(v)) + + def first_write(self, v): + try: + self.write_normal(v) + self.write = self.write_normal + except TypeError: + # input is probably a bytestream + self.write_bytes(v) + self.write = self.write_bytes + own_fh = False - if is_pathlib_path(fname): - fname = str(fname) + if isinstance(fname, os_PathLike): + fname = os_fspath(fname) if _is_string_like(fname): + # datasource doesn't support creating a new file ... 
+ open(fname, 'wt').close() + fh = np.lib._datasource.open(fname, 'wt', encoding=encoding) own_fh = True - if fname.endswith('.gz'): - import gzip - fh = gzip.open(fname, 'wb') - else: - if sys.version_info[0] >= 3: - fh = open(fname, 'wb') - else: - fh = open(fname, 'w') + # need to convert str to unicode for text io output + if sys.version_info[0] == 2: + fh = WriteWrap(fh, encoding or 'latin1') elif hasattr(fname, 'write'): - fh = fname + # wrap to handle byte output streams + fh = WriteWrap(fname, encoding or 'latin1') else: raise ValueError('fname must be a string or file handle') @@ -1200,7 +1370,10 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', X = np.asarray(X) # Handle 1-dimensional arrays - if X.ndim == 1: + if X.ndim == 0 or X.ndim > 2: + raise ValueError( + "Expected 1D or 2D array, got %dD array instead" % X.ndim) + elif X.ndim == 1: # Common case -- 1d array of numbers if X.dtype.names is None: X = np.atleast_2d(X).T @@ -1208,7 +1381,7 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', # Complex dtype -- each field indicates a separate column else: - ncol = len(X.dtype.descr) + ncol = len(X.dtype.names) else: ncol = X.shape[1] @@ -1239,31 +1412,35 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', if len(header) > 0: header = header.replace('\n', '\n' + comments) - fh.write(asbytes(comments + header + newline)) + fh.write(comments + header + newline) if iscomplex_X: for row in X: row2 = [] for number in row: row2.append(number.real) row2.append(number.imag) - fh.write(asbytes(format % tuple(row2) + newline)) + s = format % tuple(row2) + newline + fh.write(s.replace('+-', '-')) else: for row in X: try: - fh.write(asbytes(format % tuple(row) + newline)) + v = format % tuple(row) + newline except TypeError: raise TypeError("Mismatch between array dtype ('%s') and " "format specifier ('%s')" % (str(X.dtype), format)) + fh.write(v) + if len(footer) > 0: footer = 
footer.replace('\n', '\n' + comments) - fh.write(asbytes(comments + footer + newline)) + fh.write(comments + footer + newline) finally: if own_fh: fh.close() -def fromregex(file, regexp, dtype): +@set_module('numpy') +def fromregex(file, regexp, dtype, encoding=None): """ Construct an array from a text file, using regular expression parsing. @@ -1280,6 +1457,10 @@ def fromregex(file, regexp, dtype): Groups in the regular expression correspond to fields in the dtype. dtype : dtype or list of dtypes Dtype for the structured array. + encoding : str, optional + Encoding used to decode the inputfile. Does not apply to input streams. + + .. versionadded:: 1.14.0 Returns ------- @@ -1305,31 +1486,37 @@ def fromregex(file, regexp, dtype): Examples -------- >>> f = open('test.dat', 'w') - >>> f.write("1312 foo\\n1534 bar\\n444 qux") + >>> _ = f.write("1312 foo\\n1534 bar\\n444 qux") >>> f.close() >>> regexp = r"(\\d+)\\s+(...)" # match [digits, whitespace, anything] >>> output = np.fromregex('test.dat', regexp, ... 
[('num', np.int64), ('key', 'S3')]) >>> output - array([(1312L, 'foo'), (1534L, 'bar'), (444L, 'qux')], - dtype=[('num', '<i8'), ('key', '|S3')]) + array([(1312, b'foo'), (1534, b'bar'), ( 444, b'qux')], + dtype=[('num', '<i8'), ('key', 'S3')]) >>> output['num'] - array([1312, 1534, 444], dtype=int64) + array([1312, 1534, 444]) """ own_fh = False if not hasattr(file, "read"): - file = open(file, 'rb') + file = np.lib._datasource.open(file, 'rt', encoding=encoding) own_fh = True try: - if not hasattr(regexp, 'match'): - regexp = re.compile(asbytes(regexp)) if not isinstance(dtype, np.dtype): dtype = np.dtype(dtype) - seq = regexp.findall(file.read()) + content = file.read() + if isinstance(content, bytes) and isinstance(regexp, np.unicode): + regexp = asbytes(regexp) + elif isinstance(content, np.unicode) and isinstance(regexp, bytes): + regexp = asstr(regexp) + + if not hasattr(regexp, 'match'): + regexp = re.compile(regexp) + seq = regexp.findall(content) if seq and not isinstance(seq[0], tuple): # Only one group is in the regexp. # Create the new array as a single data-type and then @@ -1351,13 +1538,14 @@ def fromregex(file, regexp, dtype): #####-------------------------------------------------------------------------- +@set_module('numpy') def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skip_header=0, skip_footer=0, converters=None, missing_values=None, filling_values=None, usecols=None, names=None, excludelist=None, deletechars=None, replace_space='_', autostrip=False, case_sensitive=True, defaultfmt="f%i", unpack=None, usemask=False, loose=True, - invalid_raise=True, max_rows=None): + invalid_raise=True, max_rows=None, encoding='bytes'): """ Load data from a text file, with missing values handled as specified. @@ -1403,11 +1591,12 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, Which columns to read, with 0 being the first. For example, ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns. 
names : {None, True, str, sequence}, optional - If `names` is True, the field names are read from the first valid line - after the first `skip_header` lines. - If `names` is a sequence or a single-string of comma-separated names, - the names will be used to define the field names in a structured dtype. - If `names` is None, the names of the dtype fields will be used, if any. + If `names` is True, the field names are read from the first line after + the first `skip_header` lines. This line can optionally be preceded + by a comment delimiter. If `names` is a sequence or a single-string of + comma-separated names, the names will be used to define the field names + in a structured dtype. If `names` is None, the names of the dtype + fields will be used, if any. excludelist : sequence, optional A list of names to exclude. This list is appended to the default list ['return','file','print']. Excluded names are appended an underscore: @@ -1444,6 +1633,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, to read the entire file. .. versionadded:: 1.10.0 + encoding : str, optional + Encoding used to decode the inputfile. Does not apply when `fname` is + a file object. The special value 'bytes' enables backward compatibility + workarounds that ensure that you receive byte arrays when possible + and passes latin1 encoded strings to converters. Override this value to + receive unicode arrays and pass strings as input to converters. If set + to None the system default is used. The default value is 'bytes'. + + .. versionadded:: 1.14.0 Returns ------- @@ -1468,7 +1666,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, References ---------- .. [1] NumPy User Guide, section `I/O with NumPy - <http://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_. + <https://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_. 
Examples --------- @@ -1477,39 +1675,39 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, Comma delimited file with mixed dtype - >>> s = StringIO("1,1.3,abcde") + >>> s = StringIO(u"1,1.3,abcde") >>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'), ... ('mystring','S5')], delimiter=",") >>> data - array((1, 1.3, 'abcde'), - dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')]) + array((1, 1.3, b'abcde'), + dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')]) Using dtype = None - >>> s.seek(0) # needed for StringIO example only + >>> _ = s.seek(0) # needed for StringIO example only >>> data = np.genfromtxt(s, dtype=None, ... names = ['myint','myfloat','mystring'], delimiter=",") >>> data - array((1, 1.3, 'abcde'), - dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')]) + array((1, 1.3, b'abcde'), + dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')]) Specifying dtype and names - >>> s.seek(0) + >>> _ = s.seek(0) >>> data = np.genfromtxt(s, dtype="i8,f8,S5", ... names=['myint','myfloat','mystring'], delimiter=",") >>> data - array((1, 1.3, 'abcde'), - dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')]) + array((1, 1.3, b'abcde'), + dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')]) An example with fixed-width columns - >>> s = StringIO("11.3abcde") + >>> s = StringIO(u"11.3abcde") >>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'], ... 
delimiter=[1,3,5]) >>> data - array((1, 1.3, 'abcde'), - dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', '|S5')]) + array((1, 1.3, b'abcde'), + dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', 'S5')]) """ if max_rows is not None: @@ -1520,15 +1718,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, if max_rows < 1: raise ValueError("'max_rows' must be at least 1.") - # Py3 data conversions to bytes, for convenience - if comments is not None: - comments = asbytes(comments) - if isinstance(delimiter, unicode): - delimiter = asbytes(delimiter) - if isinstance(missing_values, (unicode, list, tuple)): - missing_values = asbytes_nested(missing_values) - - # if usemask: from numpy.ma import MaskedArray, make_mask_descr # Check the input dictionary of converters @@ -1538,290 +1727,306 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, "The input argument 'converter' should be a valid dictionary " "(got '%s' instead)" % type(user_converters)) + if encoding == 'bytes': + encoding = None + byte_converters = True + else: + byte_converters = False + # Initialize the filehandle, the LineSplitter and the NameValidator - own_fhd = False try: - if is_pathlib_path(fname): - fname = str(fname) + if isinstance(fname, os_PathLike): + fname = os_fspath(fname) if isinstance(fname, basestring): - if sys.version_info[0] == 2: - fhd = iter(np.lib._datasource.open(fname, 'rbU')) - else: - fhd = iter(np.lib._datasource.open(fname, 'rb')) - own_fhd = True + fid = np.lib._datasource.open(fname, 'rt', encoding=encoding) + fid_ctx = contextlib.closing(fid) else: - fhd = iter(fname) + fid = fname + fid_ctx = contextlib_nullcontext(fid) + fhd = iter(fid) except TypeError: raise TypeError( "fname must be a string, filehandle, list of strings, " "or generator. Got %s instead." 
% type(fname)) - split_line = LineSplitter(delimiter=delimiter, comments=comments, - autostrip=autostrip)._handyman - validate_names = NameValidator(excludelist=excludelist, - deletechars=deletechars, - case_sensitive=case_sensitive, - replace_space=replace_space) + with fid_ctx: + split_line = LineSplitter(delimiter=delimiter, comments=comments, + autostrip=autostrip, encoding=encoding) + validate_names = NameValidator(excludelist=excludelist, + deletechars=deletechars, + case_sensitive=case_sensitive, + replace_space=replace_space) - # Skip the first `skip_header` rows - for i in range(skip_header): - next(fhd) + # Skip the first `skip_header` rows + for i in range(skip_header): + next(fhd) - # Keep on until we find the first valid values - first_values = None - try: - while not first_values: - first_line = next(fhd) - if names is True: - if comments in first_line: - first_line = ( - asbytes('').join(first_line.split(comments)[1:])) - first_values = split_line(first_line) - except StopIteration: - # return an empty array if the datafile is empty - first_line = asbytes('') - first_values = [] - warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2) - - # Should we take the first values as names ? 
- if names is True: - fval = first_values[0].strip() - if fval in comments: - del first_values[0] - - # Check the columns to use: make sure `usecols` is a list - if usecols is not None: + # Keep on until we find the first valid values + first_values = None try: - usecols = [_.strip() for _ in usecols.split(",")] - except AttributeError: + while not first_values: + first_line = _decode_line(next(fhd), encoding) + if (names is True) and (comments is not None): + if comments in first_line: + first_line = ( + ''.join(first_line.split(comments)[1:])) + first_values = split_line(first_line) + except StopIteration: + # return an empty array if the datafile is empty + first_line = '' + first_values = [] + warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2) + + # Should we take the first values as names ? + if names is True: + fval = first_values[0].strip() + if comments is not None: + if fval in comments: + del first_values[0] + + # Check the columns to use: make sure `usecols` is a list + if usecols is not None: try: - usecols = list(usecols) - except TypeError: - usecols = [usecols, ] - nbcols = len(usecols or first_values) - - # Check the names and overwrite the dtype.names if needed - if names is True: - names = validate_names([_bytes_to_name(_.strip()) - for _ in first_values]) - first_line = asbytes('') - elif _is_string_like(names): - names = validate_names([_.strip() for _ in names.split(',')]) - elif names: - names = validate_names(names) - # Get the dtype - if dtype is not None: - dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names, - excludelist=excludelist, - deletechars=deletechars, - case_sensitive=case_sensitive, - replace_space=replace_space) - # Make sure the names is a list (for 2.5) - if names is not None: - names = list(names) - - if usecols: - for (i, current) in enumerate(usecols): - # if usecols is a list of names, convert to a list of indices - if _is_string_like(current): - usecols[i] = names.index(current) - elif 
current < 0: - usecols[i] = current + len(first_values) - # If the dtype is not None, make sure we update it - if (dtype is not None) and (len(dtype) > nbcols): - descr = dtype.descr - dtype = np.dtype([descr[_] for _ in usecols]) - names = list(dtype.names) - # If `names` is not None, update the names - elif (names is not None) and (len(names) > nbcols): - names = [names[_] for _ in usecols] - elif (names is not None) and (dtype is not None): - names = list(dtype.names) - - # Process the missing values ............................... - # Rename missing_values for convenience - user_missing_values = missing_values or () - - # Define the list of missing_values (one column: one list) - missing_values = [list([asbytes('')]) for _ in range(nbcols)] - - # We have a dictionary: process it field by field - if isinstance(user_missing_values, dict): - # Loop on the items - for (key, val) in user_missing_values.items(): - # Is the key a string ? - if _is_string_like(key): + usecols = [_.strip() for _ in usecols.split(",")] + except AttributeError: try: - # Transform it into an integer - key = names.index(key) - except ValueError: - # We couldn't find it: the name must have been dropped - continue - # Redefine the key as needed if it's a column number - if usecols: - try: - key = usecols.index(key) - except ValueError: - pass - # Transform the value as a list of string - if isinstance(val, (list, tuple)): - val = [str(_) for _ in val] + usecols = list(usecols) + except TypeError: + usecols = [usecols, ] + nbcols = len(usecols or first_values) + + # Check the names and overwrite the dtype.names if needed + if names is True: + names = validate_names([str(_.strip()) for _ in first_values]) + first_line = '' + elif _is_string_like(names): + names = validate_names([_.strip() for _ in names.split(',')]) + elif names: + names = validate_names(names) + # Get the dtype + if dtype is not None: + dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names, + excludelist=excludelist, + 
deletechars=deletechars, + case_sensitive=case_sensitive, + replace_space=replace_space) + # Make sure the names is a list (for 2.5) + if names is not None: + names = list(names) + + if usecols: + for (i, current) in enumerate(usecols): + # if usecols is a list of names, convert to a list of indices + if _is_string_like(current): + usecols[i] = names.index(current) + elif current < 0: + usecols[i] = current + len(first_values) + # If the dtype is not None, make sure we update it + if (dtype is not None) and (len(dtype) > nbcols): + descr = dtype.descr + dtype = np.dtype([descr[_] for _ in usecols]) + names = list(dtype.names) + # If `names` is not None, update the names + elif (names is not None) and (len(names) > nbcols): + names = [names[_] for _ in usecols] + elif (names is not None) and (dtype is not None): + names = list(dtype.names) + + # Process the missing values ............................... + # Rename missing_values for convenience + user_missing_values = missing_values or () + if isinstance(user_missing_values, bytes): + user_missing_values = user_missing_values.decode('latin1') + + # Define the list of missing_values (one column: one list) + missing_values = [list(['']) for _ in range(nbcols)] + + # We have a dictionary: process it field by field + if isinstance(user_missing_values, dict): + # Loop on the items + for (key, val) in user_missing_values.items(): + # Is the key a string ? 
+ if _is_string_like(key): + try: + # Transform it into an integer + key = names.index(key) + except ValueError: + # We couldn't find it: the name must have been dropped + continue + # Redefine the key as needed if it's a column number + if usecols: + try: + key = usecols.index(key) + except ValueError: + pass + # Transform the value as a list of string + if isinstance(val, (list, tuple)): + val = [str(_) for _ in val] + else: + val = [str(val), ] + # Add the value(s) to the current list of missing + if key is None: + # None acts as default + for miss in missing_values: + miss.extend(val) + else: + missing_values[key].extend(val) + # We have a sequence : each item matches a column + elif isinstance(user_missing_values, (list, tuple)): + for (value, entry) in zip(user_missing_values, missing_values): + value = str(value) + if value not in entry: + entry.append(value) + # We have a string : apply it to all entries + elif isinstance(user_missing_values, basestring): + user_value = user_missing_values.split(",") + for entry in missing_values: + entry.extend(user_value) + # We have something else: apply it to all entries + else: + for entry in missing_values: + entry.extend([str(user_missing_values)]) + + # Process the filling_values ............................... 
+ # Rename the input for convenience + user_filling_values = filling_values + if user_filling_values is None: + user_filling_values = [] + # Define the default + filling_values = [None] * nbcols + # We have a dictionary : update each entry individually + if isinstance(user_filling_values, dict): + for (key, val) in user_filling_values.items(): + if _is_string_like(key): + try: + # Transform it into an integer + key = names.index(key) + except ValueError: + # We couldn't find it: the name must have been dropped, + continue + # Redefine the key if it's a column number and usecols is defined + if usecols: + try: + key = usecols.index(key) + except ValueError: + pass + # Add the value to the list + filling_values[key] = val + # We have a sequence : update on a one-to-one basis + elif isinstance(user_filling_values, (list, tuple)): + n = len(user_filling_values) + if (n <= nbcols): + filling_values[:n] = user_filling_values else: - val = [str(val), ] - # Add the value(s) to the current list of missing - if key is None: - # None acts as default - for miss in missing_values: - miss.extend(val) + filling_values = user_filling_values[:nbcols] + # We have something else : use it for all entries + else: + filling_values = [user_filling_values] * nbcols + + # Initialize the converters ................................ + if dtype is None: + # Note: we can't use a [...]*nbcols, as we would have 3 times the same + # ... converter, instead of 3 different converters. 
+ converters = [StringConverter(None, missing_values=miss, default=fill) + for (miss, fill) in zip(missing_values, filling_values)] + else: + dtype_flat = flatten_dtype(dtype, flatten_base=True) + # Initialize the converters + if len(dtype_flat) > 1: + # Flexible type : get a converter from each dtype + zipit = zip(dtype_flat, missing_values, filling_values) + converters = [StringConverter(dt, locked=True, + missing_values=miss, default=fill) + for (dt, miss, fill) in zipit] else: - missing_values[key].extend(val) - # We have a sequence : each item matches a column - elif isinstance(user_missing_values, (list, tuple)): - for (value, entry) in zip(user_missing_values, missing_values): - value = str(value) - if value not in entry: - entry.append(value) - # We have a string : apply it to all entries - elif isinstance(user_missing_values, bytes): - user_value = user_missing_values.split(asbytes(",")) - for entry in missing_values: - entry.extend(user_value) - # We have something else: apply it to all entries - else: - for entry in missing_values: - entry.extend([str(user_missing_values)]) - - # Process the filling_values ............................... 
- # Rename the input for convenience - user_filling_values = filling_values - if user_filling_values is None: - user_filling_values = [] - # Define the default - filling_values = [None] * nbcols - # We have a dictionary : update each entry individually - if isinstance(user_filling_values, dict): - for (key, val) in user_filling_values.items(): - if _is_string_like(key): + # Set to a default converter (but w/ different missing values) + zipit = zip(missing_values, filling_values) + converters = [StringConverter(dtype, locked=True, + missing_values=miss, default=fill) + for (miss, fill) in zipit] + # Update the converters to use the user-defined ones + uc_update = [] + for (j, conv) in user_converters.items(): + # If the converter is specified by column names, use the index instead + if _is_string_like(j): try: - # Transform it into an integer - key = names.index(key) + j = names.index(j) + i = j except ValueError: - # We couldn't find it: the name must have been dropped, continue - # Redefine the key if it's a column number and usecols is defined - if usecols: + elif usecols: try: - key = usecols.index(key) + i = usecols.index(j) except ValueError: - pass - # Add the value to the list - filling_values[key] = val - # We have a sequence : update on a one-to-one basis - elif isinstance(user_filling_values, (list, tuple)): - n = len(user_filling_values) - if (n <= nbcols): - filling_values[:n] = user_filling_values - else: - filling_values = user_filling_values[:nbcols] - # We have something else : use it for all entries - else: - filling_values = [user_filling_values] * nbcols - - # Initialize the converters ................................ - if dtype is None: - # Note: we can't use a [...]*nbcols, as we would have 3 times the same - # ... converter, instead of 3 different converters. 
- converters = [StringConverter(None, missing_values=miss, default=fill) - for (miss, fill) in zip(missing_values, filling_values)] - else: - dtype_flat = flatten_dtype(dtype, flatten_base=True) - # Initialize the converters - if len(dtype_flat) > 1: - # Flexible type : get a converter from each dtype - zipit = zip(dtype_flat, missing_values, filling_values) - converters = [StringConverter(dt, locked=True, - missing_values=miss, default=fill) - for (dt, miss, fill) in zipit] - else: - # Set to a default converter (but w/ different missing values) - zipit = zip(missing_values, filling_values) - converters = [StringConverter(dtype, locked=True, - missing_values=miss, default=fill) - for (miss, fill) in zipit] - # Update the converters to use the user-defined ones - uc_update = [] - for (j, conv) in user_converters.items(): - # If the converter is specified by column names, use the index instead - if _is_string_like(j): - try: - j = names.index(j) + # Unused converter specified + continue + else: i = j - except ValueError: - continue - elif usecols: - try: - i = usecols.index(j) - except ValueError: - # Unused converter specified + # Find the value to test - first_line is not filtered by usecols: + if len(first_line): + testing_value = first_values[j] + else: + testing_value = None + if conv is bytes: + user_conv = asbytes + elif byte_converters: + # converters may use decode to workaround numpy's old behaviour, + # so encode the string again before passing to the user converter + def tobytes_first(x, conv): + if type(x) is bytes: + return conv(x) + return conv(x.encode("latin1")) + user_conv = functools.partial(tobytes_first, conv=conv) + else: + user_conv = conv + converters[i].update(user_conv, locked=True, + testing_value=testing_value, + default=filling_values[i], + missing_values=missing_values[i],) + uc_update.append((i, user_conv)) + # Make sure we have the corrected keys in user_converters... 
+ user_converters.update(uc_update) + + # Fixme: possible error as following variable never used. + # miss_chars = [_.missing_values for _ in converters] + + # Initialize the output lists ... + # ... rows + rows = [] + append_to_rows = rows.append + # ... masks + if usemask: + masks = [] + append_to_masks = masks.append + # ... invalid + invalid = [] + append_to_invalid = invalid.append + + # Parse each line + for (i, line) in enumerate(itertools.chain([first_line, ], fhd)): + values = split_line(line) + nbvalues = len(values) + # Skip an empty line + if nbvalues == 0: continue - else: - i = j - # Find the value to test - first_line is not filtered by usecols: - if len(first_line): - testing_value = first_values[j] - else: - testing_value = None - converters[i].update(conv, locked=True, - testing_value=testing_value, - default=filling_values[i], - missing_values=missing_values[i],) - uc_update.append((i, conv)) - # Make sure we have the corrected keys in user_converters... - user_converters.update(uc_update) - - # Fixme: possible error as following variable never used. - #miss_chars = [_.missing_values for _ in converters] - - # Initialize the output lists ... - # ... rows - rows = [] - append_to_rows = rows.append - # ... masks - if usemask: - masks = [] - append_to_masks = masks.append - # ... 
invalid - invalid = [] - append_to_invalid = invalid.append - - # Parse each line - for (i, line) in enumerate(itertools.chain([first_line, ], fhd)): - values = split_line(line) - nbvalues = len(values) - # Skip an empty line - if nbvalues == 0: - continue - if usecols: - # Select only the columns we need - try: - values = [values[_] for _ in usecols] - except IndexError: + if usecols: + # Select only the columns we need + try: + values = [values[_] for _ in usecols] + except IndexError: + append_to_invalid((i + skip_header + 1, nbvalues)) + continue + elif nbvalues != nbcols: append_to_invalid((i + skip_header + 1, nbvalues)) continue - elif nbvalues != nbcols: - append_to_invalid((i + skip_header + 1, nbvalues)) - continue - # Store the values - append_to_rows(tuple(values)) - if usemask: - append_to_masks(tuple([v.strip() in m - for (v, m) in zip(values, - missing_values)])) - if len(rows) == max_rows: - break - - if own_fhd: - fhd.close() + # Store the values + append_to_rows(tuple(values)) + if usemask: + append_to_masks(tuple([v.strip() in m + for (v, m) in zip(values, + missing_values)])) + if len(rows) == max_rows: + break # Upgrade the converters (if needed) if dtype is None: @@ -1892,25 +2097,54 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, column_types = [conv.type for conv in converters] # Find the columns with strings... strcolidx = [i for (i, v) in enumerate(column_types) - if v in (type('S'), np.string_)] - # ... and take the largest number of chars. - for i in strcolidx: - column_types[i] = "|S%i" % max(len(row[i]) for row in data) - # + if v == np.unicode_] + + if byte_converters and strcolidx: + # convert strings back to bytes for backward compatibility + warnings.warn( + "Reading unicode strings without specifying the encoding " + "argument is deprecated. 
Set the encoding, use None for the " + "system default.", + np.VisibleDeprecationWarning, stacklevel=2) + def encode_unicode_cols(row_tup): + row = list(row_tup) + for i in strcolidx: + row[i] = row[i].encode('latin1') + return tuple(row) + + try: + data = [encode_unicode_cols(r) for r in data] + except UnicodeEncodeError: + pass + else: + for i in strcolidx: + column_types[i] = np.bytes_ + + # Update string types to be the right length + sized_column_types = column_types[:] + for i, col_type in enumerate(column_types): + if np.issubdtype(col_type, np.character): + n_chars = max(len(row[i]) for row in data) + sized_column_types[i] = (col_type, n_chars) + if names is None: - # If the dtype is uniform, don't define names, else use '' - base = set([c.type for c in converters if c._checked]) + # If the dtype is uniform (before sizing strings) + base = { + c_type + for c, c_type in zip(converters, column_types) + if c._checked} if len(base) == 1: - (ddtype, mdtype) = (list(base)[0], np.bool) + uniform_type, = base + (ddtype, mdtype) = (uniform_type, bool) else: ddtype = [(defaultfmt % i, dt) - for (i, dt) in enumerate(column_types)] + for (i, dt) in enumerate(sized_column_types)] if usemask: - mdtype = [(defaultfmt % i, np.bool) - for (i, dt) in enumerate(column_types)] + mdtype = [(defaultfmt % i, bool) + for (i, dt) in enumerate(sized_column_types)] else: - ddtype = list(zip(names, column_types)) - mdtype = list(zip(names, [np.bool] * len(column_types))) + ddtype = list(zip(names, sized_column_types)) + mdtype = list(zip(names, [bool] * len(sized_column_types))) output = np.array(data, dtype=ddtype) if usemask: outputmask = np.array(masks, dtype=mdtype) @@ -1936,7 +2170,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Now, process the rowmasks the same way if usemask: rowmasks = np.array( - masks, dtype=np.dtype([('', np.bool) for t in dtype_flat])) + masks, dtype=np.dtype([('', bool) for t in dtype_flat])) # Construct the new dtype mdtype = 
make_mask_descr(dtype) outputmask = rowmasks.view(mdtype) @@ -1950,8 +2184,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, # Keep the dtype of the current converter if i in user_converters: ishomogeneous &= (ttype == dtype.type) - if ttype == np.string_: - ttype = "|S%i" % max(len(row[i]) for row in data) + if np.issubdtype(ttype, np.character): + ttype = (ttype, max(len(row[i]) for row in data)) descr.append(('', ttype)) else: descr.append(('', dtype)) @@ -1967,16 +2201,16 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, output = np.array(data, dtype) if usemask: if dtype.names: - mdtype = [(_, np.bool) for _ in dtype.names] + mdtype = [(_, bool) for _ in dtype.names] else: - mdtype = np.bool + mdtype = bool outputmask = np.array(masks, dtype=mdtype) # Try to take care of the missing data we missed names = output.dtype.names if usemask and names: - for (name, conv) in zip(names or (), converters): + for (name, conv) in zip(names, converters): missing_values = [conv(_) for _ in conv.missing_values - if _ != asbytes('')] + if _ != ''] for mval in missing_values: outputmask[name] |= (output[name] == mval) # Construct the final array |