Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--  numpy/lib/npyio.py  1308
1 file changed, 771 insertions, 537 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index 0dee6b333..d6d2a0c6c 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -3,32 +3,46 @@ from __future__ import division, absolute_import, print_function
import sys
import os
import re
+import functools
import itertools
import warnings
import weakref
+import contextlib
from operator import itemgetter, index as opindex
import numpy as np
from . import format
from ._datasource import DataSource
+from numpy.core import overrides
from numpy.core.multiarray import packbits, unpackbits
+from numpy.core.overrides import set_module
+from numpy.core._internal import recursive
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
ConverterLockError, ConversionWarning, _is_string_like,
- has_nested_fields, flatten_dtype, easy_dtype, _bytes_to_name
+ has_nested_fields, flatten_dtype, easy_dtype, _decode_line
)
from numpy.compat import (
- asbytes, asstr, asbytes_nested, bytes, basestring, unicode, is_pathlib_path
+ asbytes, asstr, asunicode, bytes, basestring, os_fspath, os_PathLike,
+ pickle, contextlib_nullcontext
)
if sys.version_info[0] >= 3:
- import pickle
+ from collections.abc import Mapping
else:
- import cPickle as pickle
from future_builtins import map
+ from collections import Mapping
+
+
+@set_module('numpy')
+def loads(*args, **kwargs):
+ # NumPy 1.15.0, 2017-12-10
+ warnings.warn(
+ "np.loads is deprecated, use pickle.loads instead",
+ DeprecationWarning, stacklevel=2)
+ return pickle.loads(*args, **kwargs)
-loads = pickle.loads
__all__ = [
'savetxt', 'loadtxt', 'genfromtxt', 'ndfromtxt', 'mafromtxt',
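
The shim above keeps ``np.loads`` importable while routing callers to ``pickle.loads``. A minimal sketch of the intended behaviour once the patch is applied (doctest-style; only names from the patch and the stdlib):

>>> import pickle, warnings
>>> buf = pickle.dumps(np.arange(3))
>>> with warnings.catch_warnings(record=True) as w:
...     warnings.simplefilter("always")
...     np.loads(buf)
array([0, 1, 2])
>>> w[0].category
<class 'DeprecationWarning'>
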
@@ -37,6 +51,10 @@ __all__ = [
]
+array_function_dispatch = functools.partial(
+ overrides.array_function_dispatch, module='numpy')
+
+
class BagObj(object):
"""
BagObj(obj)
@@ -83,7 +101,7 @@ class BagObj(object):
This also enables tab-completion in an interpreter or IPython.
"""
- return object.__getattribute__(self, '_obj').keys()
+ return list(object.__getattribute__(self, '_obj').keys())
def zipfile_factory(file, *args, **kwargs):
@@ -94,14 +112,14 @@ def zipfile_factory(file, *args, **kwargs):
pathlib.Path objects. `args` and `kwargs` are passed to the zipfile.ZipFile
constructor.
"""
- if is_pathlib_path(file):
- file = str(file)
+ if not hasattr(file, 'read'):
+ file = os_fspath(file)
import zipfile
kwargs['allowZip64'] = True
return zipfile.ZipFile(file, *args, **kwargs)
-class NpzFile(object):
+class NpzFile(Mapping):
"""
NpzFile(fid)
@@ -150,13 +168,13 @@ class NpzFile(object):
>>> x = np.arange(10)
>>> y = np.sin(x)
>>> np.savez(outfile, x=x, y=y)
- >>> outfile.seek(0)
+ >>> _ = outfile.seek(0)
>>> npz = np.load(outfile)
>>> isinstance(npz, np.lib.npyio.NpzFile)
True
- >>> npz.files
- ['y', 'x']
+ >>> sorted(npz.files)
+ ['x', 'y']
>>> npz['x'] # getitem access
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> npz.f.x # attribute lookup
@@ -207,6 +225,13 @@ class NpzFile(object):
def __del__(self):
self.close()
+ # Implement the Mapping ABC
+ def __iter__(self):
+ return iter(self.files)
+
+ def __len__(self):
+ return len(self.files)
+
def __getitem__(self, key):
# FIXME: This seems like it will copy strings around
# more than is strictly necessary. The zipfile
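
With ``NpzFile`` now deriving from ``Mapping`` (via the ``__iter__``/``__len__`` additions above), the archive supports the standard read-only dict protocol. A small sketch, assuming the patch is applied:

>>> from io import BytesIO
>>> outfile = BytesIO()
>>> np.savez(outfile, x=np.arange(3))
>>> _ = outfile.seek(0)
>>> npz = np.load(outfile)
>>> len(npz)
1
>>> [key for key in npz]
['x']
>>> 'x' in npz
True
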
@@ -216,11 +241,11 @@ class NpzFile(object):
# It would be better if the zipfile could read
# (or at least uncompress) the data
# directly into the array memory.
- member = 0
+ member = False
if key in self._files:
- member = 1
+ member = True
elif key in self.files:
- member = 1
+ member = True
key += '.npy'
if member:
bytes = self.zip.open(key)
@@ -236,38 +261,41 @@ class NpzFile(object):
else:
raise KeyError("%s is not a file in the archive" % key)
- def __iter__(self):
- return iter(self.files)
-
- def items(self):
- """
- Return a list of tuples, with each tuple (filename, array in file).
- """
- return [(f, self[f]) for f in self.files]
+ if sys.version_info.major == 3:
+ # deprecate the python 2 dict apis that we supported by accident in
+ # python 3. We forgot to implement itervalues() at all in earlier
+ # versions of numpy, so no need to deprecate it here.
- def iteritems(self):
- """Generator that returns tuples (filename, array in file)."""
- for f in self.files:
- yield (f, self[f])
+ def iteritems(self):
+ # Numpy 1.15, 2018-02-20
+ warnings.warn(
+ "NpzFile.iteritems is deprecated in python 3, to match the "
+ "removal of dict.itertems. Use .items() instead.",
+ DeprecationWarning, stacklevel=2)
+ return self.items()
- def keys(self):
- """Return files in the archive with a ``.npy`` extension."""
- return self.files
-
- def iterkeys(self):
- """Return an iterator over the files in the archive."""
- return self.__iter__()
-
- def __contains__(self, key):
- return self.files.__contains__(key)
+ def iterkeys(self):
+ # Numpy 1.15, 2018-02-20
+ warnings.warn(
+ "NpzFile.iterkeys is deprecated in python 3, to match the "
+ "removal of dict.iterkeys. Use .keys() instead.",
+ DeprecationWarning, stacklevel=2)
+ return self.keys()
+@set_module('numpy')
def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
encoding='ASCII'):
"""
Load arrays or pickled objects from ``.npy``, ``.npz`` or pickled files.
+ .. warning:: Loading files that contain object arrays uses the ``pickle``
+ module, which is not secure against erroneous or maliciously
+ constructed data. Consider passing ``allow_pickle=False`` to
+ load data that is known not to contain object arrays for the
+ safer handling of untrusted sources.
+
Parameters
----------
file : file-like object, string, or pathlib.Path
@@ -294,7 +322,7 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
used in Python 3.
encoding : str, optional
What encoding to use when reading Python 2 strings. Only useful when
- loading Python 2 generated pickled files on Python 3, which includes
+ loading Python 2 generated pickled files in Python 3, which includes
npy/npz files containing object arrays. Values other than 'latin1',
'ASCII', and 'bytes' are not allowed, as they can corrupt numerical
data. Default: 'ASCII'
@@ -365,16 +393,6 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
memmap([4, 5, 6])
"""
- own_fid = False
- if isinstance(file, basestring):
- fid = open(file, "rb")
- own_fid = True
- elif is_pathlib_path(file):
- fid = file.open("rb")
- own_fid = True
- else:
- fid = file
-
if encoding not in ('ASCII', 'latin1', 'bytes'):
# The 'encoding' value for pickle also affects what encoding
# the serialized binary data of NumPy arrays is loaded
@@ -395,21 +413,30 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
# Nothing to do on Python 2
pickle_kwargs = {}
+ # TODO: Use contextlib.ExitStack once we drop Python 2
+ if hasattr(file, 'read'):
+ fid = file
+ own_fid = False
+ else:
+ fid = open(os_fspath(file), "rb")
+ own_fid = True
+
try:
# Code to distinguish from NumPy binary files and pickles.
- _ZIP_PREFIX = asbytes('PK\x03\x04')
+ _ZIP_PREFIX = b'PK\x03\x04'
+ _ZIP_SUFFIX = b'PK\x05\x06' # empty zip files start with this
N = len(format.MAGIC_PREFIX)
magic = fid.read(N)
# If the file size is less than N, we need to make sure not
# to seek past the beginning of the file
fid.seek(-min(N, len(magic)), 1) # back-up
- if magic.startswith(_ZIP_PREFIX):
+ if magic.startswith(_ZIP_PREFIX) or magic.startswith(_ZIP_SUFFIX):
# zip-file (assume .npz)
# Transfer file ownership to NpzFile
- tmp = own_fid
+ ret = NpzFile(fid, own_fid=own_fid, allow_pickle=allow_pickle,
+ pickle_kwargs=pickle_kwargs)
own_fid = False
- return NpzFile(fid, own_fid=tmp, allow_pickle=allow_pickle,
- pickle_kwargs=pickle_kwargs)
+ return ret
elif magic == format.MAGIC_PREFIX:
# .npy file
if mmap_mode:
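
The new ``_ZIP_SUFFIX`` check recognizes the end-of-central-directory signature, so a zip archive with no members (as produced by ``np.savez`` called with no arrays) now loads instead of falling through to the pickle branch. A sketch, assuming the patch:

>>> from io import BytesIO
>>> empty = BytesIO()
>>> np.savez(empty)  # archive with zero members starts with b'PK\x05\x06'
>>> _ = empty.seek(0)
>>> np.load(empty).files
[]
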
@@ -420,11 +447,11 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
else:
# Try a pickle
if not allow_pickle:
- raise ValueError("allow_pickle=False, but file does not contain "
- "non-pickled data")
+ raise ValueError("Cannot load file containing pickled data "
+ "when allow_pickle=False")
try:
return pickle.load(fid, **pickle_kwargs)
- except:
+ except Exception:
raise IOError(
"Failed to interpret file %s as a pickle" % repr(file))
finally:
@@ -432,6 +459,11 @@ def load(file, mmap_mode=None, allow_pickle=True, fix_imports=True,
fid.close()
+def _save_dispatcher(file, arr, allow_pickle=None, fix_imports=None):
+ return (arr,)
+
+
+@array_function_dispatch(_save_dispatcher)
def save(file, arr, allow_pickle=True, fix_imports=True):
"""
Save an array to a binary file in NumPy ``.npy`` format.
@@ -443,6 +475,8 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
then the filename is unchanged. If file is a string or Path, a ``.npy``
extension will be appended to the file name if it does not already
have one.
+ arr : array_like
+ Array data to be saved.
allow_pickle : bool, optional
Allow saving object arrays using Python pickles. Reasons for disallowing
pickles include security (loading pickled data can execute arbitrary
@@ -456,8 +490,6 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
pickled in a Python 2 compatible way. If `fix_imports` is True, pickle
will try to map the new Python 3 names to the old module names used in
Python 2, so that the pickle data stream is readable with Python 2.
- arr : array_like
- Array data to be saved.
See Also
--------
@@ -466,9 +498,7 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
Notes
-----
- For a description of the ``.npy`` format, see the module docstring
- of `numpy.lib.format` or the NumPy Enhancement Proposal
- http://docs.scipy.org/doc/numpy/neps/npy-format.html
+ For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
Examples
--------
@@ -478,24 +508,20 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
>>> x = np.arange(10)
>>> np.save(outfile, x)
- >>> outfile.seek(0) # Only needed here to simulate closing & reopening file
+ >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file
>>> np.load(outfile)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
"""
own_fid = False
- if isinstance(file, basestring):
+ if hasattr(file, 'read'):
+ fid = file
+ else:
+ file = os_fspath(file)
if not file.endswith('.npy'):
file = file + '.npy'
fid = open(file, "wb")
own_fid = True
- elif is_pathlib_path(file):
- if not file.name.endswith('.npy'):
- file = file.parent / (file.name + '.npy')
- fid = file.open("wb")
- own_fid = True
- else:
- fid = file
if sys.version_info[0] >= 3:
pickle_kwargs = dict(fix_imports=fix_imports)
@@ -512,6 +538,14 @@ def save(file, arr, allow_pickle=True, fix_imports=True):
fid.close()
+def _savez_dispatcher(file, *args, **kwds):
+ for a in args:
+ yield a
+ for v in kwds.values():
+ yield v
+
+
+@array_function_dispatch(_savez_dispatcher)
def savez(file, *args, **kwds):
"""
Save several arrays into a single file in uncompressed ``.npz`` format.
@@ -552,9 +586,7 @@ def savez(file, *args, **kwds):
The ``.npz`` file format is a zipped archive of files named after the
variables they contain. The archive is not compressed and each file
in the archive contains one variable in ``.npy`` format. For a
- description of the ``.npy`` format, see `numpy.lib.format` or the
- NumPy Enhancement Proposal
- http://docs.scipy.org/doc/numpy/neps/npy-format.html
+ description of the ``.npy`` format, see :py:mod:`numpy.lib.format`.
When opening the saved ``.npz`` file with `load` a `NpzFile` object is
returned. This is a dictionary-like object which can be queried for
@@ -571,10 +603,10 @@ def savez(file, *args, **kwds):
Using `savez` with \\*args, the arrays are saved with default names.
>>> np.savez(outfile, x, y)
- >>> outfile.seek(0) # Only needed here to simulate closing & reopening file
+ >>> _ = outfile.seek(0) # Only needed here to simulate closing & reopening file
>>> npzfile = np.load(outfile)
>>> npzfile.files
- ['arr_1', 'arr_0']
+ ['arr_0', 'arr_1']
>>> npzfile['arr_0']
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
@@ -582,10 +614,10 @@ def savez(file, *args, **kwds):
>>> outfile = TemporaryFile()
>>> np.savez(outfile, x=x, y=y)
- >>> outfile.seek(0)
+ >>> _ = outfile.seek(0)
>>> npzfile = np.load(outfile)
- >>> npzfile.files
- ['y', 'x']
+ >>> sorted(npzfile.files)
+ ['x', 'y']
>>> npzfile['x']
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
@@ -593,6 +625,14 @@ def savez(file, *args, **kwds):
_savez(file, args, kwds, False)
+def _savez_compressed_dispatcher(file, *args, **kwds):
+ for a in args:
+ yield a
+ for v in kwds.values():
+ yield v
+
+
+@array_function_dispatch(_savez_compressed_dispatcher)
def savez_compressed(file, *args, **kwds):
"""
Save several arrays into a single file in compressed ``.npz`` format.
@@ -633,9 +673,9 @@ def savez_compressed(file, *args, **kwds):
The ``.npz`` file format is a zipped archive of files named after the
variables they contain. The archive is compressed with
``zipfile.ZIP_DEFLATED`` and each file in the archive contains one variable
- in ``.npy`` format. For a description of the ``.npy`` format, see
- `numpy.lib.format` or the NumPy Enhancement Proposal
- http://docs.scipy.org/doc/numpy/neps/npy-format.html
+ in ``.npy`` format. For a description of the ``.npy`` format, see
+ :py:mod:`numpy.lib.format`.
+
When opening the saved ``.npz`` file with `load` a `NpzFile` object is
returned. This is a dictionary-like object which can be queried for
@@ -661,15 +701,11 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
# Import is postponed to here since zipfile depends on gzip, an optional
# component of the so-called standard library.
import zipfile
- # Import deferred for startup time improvement
- import tempfile
- if isinstance(file, basestring):
+ if not hasattr(file, 'read'):
+ file = os_fspath(file)
if not file.endswith('.npz'):
file = file + '.npz'
- elif is_pathlib_path(file):
- if not file.name.endswith('.npz'):
- file = file.parent / (file.name + '.npz')
namedict = kwds
for i, val in enumerate(args):
@@ -686,31 +722,44 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
zipf = zipfile_factory(file, mode="w", compression=compression)
- # Stage arrays in a temporary file on disk, before writing to zip.
-
- # Since target file might be big enough to exceed capacity of a global
- # temporary directory, create temp file side-by-side with the target file.
- file_dir, file_prefix = os.path.split(file) if _is_string_like(file) else (None, 'tmp')
- fd, tmpfile = tempfile.mkstemp(prefix=file_prefix, dir=file_dir, suffix='-numpy.npy')
- os.close(fd)
- try:
+ if sys.version_info >= (3, 6):
+ # Since Python 3.6 it is possible to write directly to a ZIP file.
for key, val in namedict.items():
fname = key + '.npy'
- fid = open(tmpfile, 'wb')
- try:
- format.write_array(fid, np.asanyarray(val),
+ val = np.asanyarray(val)
+ force_zip64 = val.nbytes >= 2**30
+ with zipf.open(fname, 'w', force_zip64=force_zip64) as fid:
+ format.write_array(fid, val,
allow_pickle=allow_pickle,
pickle_kwargs=pickle_kwargs)
- fid.close()
- fid = None
- zipf.write(tmpfile, arcname=fname)
- except IOError as exc:
- raise IOError("Failed to write to %s: %s" % (tmpfile, exc))
- finally:
- if fid:
+ else:
+ # Stage arrays in a temporary file on disk, before writing to zip.
+
+ # Import deferred for startup time improvement
+ import tempfile
+ # Since target file might be big enough to exceed capacity of a global
+ # temporary directory, create temp file side-by-side with the target file.
+ file_dir, file_prefix = os.path.split(file) if _is_string_like(file) else (None, 'tmp')
+ fd, tmpfile = tempfile.mkstemp(prefix=file_prefix, dir=file_dir, suffix='-numpy.npy')
+ os.close(fd)
+ try:
+ for key, val in namedict.items():
+ fname = key + '.npy'
+ fid = open(tmpfile, 'wb')
+ try:
+ format.write_array(fid, np.asanyarray(val),
+ allow_pickle=allow_pickle,
+ pickle_kwargs=pickle_kwargs)
fid.close()
- finally:
- os.remove(tmpfile)
+ fid = None
+ zipf.write(tmpfile, arcname=fname)
+ except IOError as exc:
+ raise IOError("Failed to write to %s: %s" % (tmpfile, exc))
+ finally:
+ if fid:
+ fid.close()
+ finally:
+ os.remove(tmpfile)
zipf.close()
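
The Python 3.6 branch relies on ``zipfile.ZipFile.open(name, 'w')`` returning a writable member stream, which lets ``format.write_array`` serialize straight into the archive with no temp file. A standalone sketch of that stdlib API (not itself part of the patch):

>>> import io, zipfile
>>> buf = io.BytesIO()
>>> with zipfile.ZipFile(buf, mode='w') as zf:
...     with zf.open('x.npy', 'w') as member:  # writable member, new in 3.6
...         np.lib.format.write_array(member, np.arange(3))
>>> _ = buf.seek(0)
>>> sorted(zipfile.ZipFile(buf).namelist())
['x.npy']
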
@@ -720,8 +769,8 @@ def _getconv(dtype):
def floatconv(x):
x.lower()
- if b'0x' in x:
- return float.fromhex(asstr(x))
+ if '0x' in x:
+ return float.fromhex(x)
return float(x)
typ = dtype.type
@@ -737,17 +786,23 @@ def _getconv(dtype):
return np.longdouble
elif issubclass(typ, np.floating):
return floatconv
- elif issubclass(typ, np.complex):
- return lambda x: complex(asstr(x))
+ elif issubclass(typ, complex):
+ return lambda x: complex(asstr(x).replace('+-', '-'))
elif issubclass(typ, np.bytes_):
return asbytes
+ elif issubclass(typ, np.unicode_):
+ return asunicode
else:
return asstr
+# amount of lines loadtxt reads in one chunk, can be overridden for testing
+_loadtxt_chunksize = 50000
+
+@set_module('numpy')
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
converters=None, skiprows=0, usecols=None, unpack=False,
- ndmin=0):
+ ndmin=0, encoding='bytes', max_rows=None):
"""
Load data from a text file.
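
``floatconv`` above falls back to ``float.fromhex`` whenever the token contains ``'0x'``, so hexadecimal float literals parse under the default float dtype. A sketch, assuming the patch:

>>> from io import StringIO
>>> np.loadtxt(StringIO(u"0x1.8p1 0x1p-1"))
array([3. , 0.5])
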
@@ -765,33 +820,31 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
each row will be interpreted as an element of the array. In this
case, the number of columns used must match the number of fields in
the data-type.
- comments : str or sequence, optional
+ comments : str or sequence of str, optional
The characters or list of characters used to indicate the start of a
- comment;
- default: '#'.
+ comment. None implies no comments. For backwards compatibility, byte
+ strings will be decoded as 'latin1'. The default is '#'.
delimiter : str, optional
- The string used to separate values. By default, this is any
- whitespace.
+ The string used to separate values. For backwards compatibility, byte
+ strings will be decoded as 'latin1'. The default is whitespace.
converters : dict, optional
- A dictionary mapping column number to a function that will convert
- that column to a float. E.g., if column 0 is a date string:
- ``converters = {0: datestr2num}``. Converters can also be used to
- provide a default value for missing data (but see also `genfromtxt`):
- ``converters = {3: lambda s: float(s.strip() or 0)}``. Default: None.
+ A dictionary mapping column number to a function that will parse the
+ column string into the desired value. E.g., if column 0 is a date
+ string: ``converters = {0: datestr2num}``. Converters can also be
+ used to provide a default value for missing data (but see also
+ `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``.
+ Default: None.
skiprows : int, optional
- Skip the first `skiprows` lines; default: 0.
-
+ Skip the first `skiprows` lines, including comments; default: 0.
usecols : int or sequence, optional
Which columns to read, with 0 being the first. For example,
- usecols = (1,4,5) will extract the 2nd, 5th and 6th columns.
+ ``usecols = (1,4,5)`` will extract the 2nd, 5th and 6th columns.
The default, None, results in all columns being read.
- .. versionadded:: 1.11.0
-
- Also when a single column has to be read it is possible to use
- an integer instead of a tuple. E.g ``usecols = 3`` reads the
- fourth column the same way as `usecols = (3,)`` would.
-
+ .. versionchanged:: 1.11.0
+ When a single column has to be read it is possible to use
+ an integer instead of a tuple. E.g. ``usecols = 3`` reads the
+ fourth column the same way as ``usecols = (3,)`` would.
unpack : bool, optional
If True, the returned array is transposed, so that arguments may be
unpacked using ``x, y, z = loadtxt(...)``. When used with a structured
@@ -802,6 +855,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
Legal values: 0 (default), 1 or 2.
.. versionadded:: 1.6.0
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply to input streams.
+ The special value 'bytes' enables backward compatibility workarounds
+ that ensure you receive byte arrays as results if possible and pass
+ 'latin1' encoded strings to converters. Override this value to receive
+ unicode arrays and pass strings as input to converters. If set to None
+ the system default is used. The default value is 'bytes'.
+
+ .. versionadded:: 1.14.0
+ max_rows : int, optional
+ Read `max_rows` lines of content after `skiprows` lines. The default
+ is to read all the lines.
+
+ .. versionadded:: 1.16.0
Returns
-------
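
Together, ``skiprows`` and the new ``max_rows`` give a simple windowed read. A sketch of the intended semantics, assuming the patch:

>>> from io import StringIO
>>> c = StringIO(u"# header\n0 1\n2 3\n4 5")
>>> np.loadtxt(c, skiprows=1, max_rows=2)
array([[0., 1.],
       [2., 3.]])
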
@@ -828,38 +895,44 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
Examples
--------
>>> from io import StringIO # StringIO behaves like a file object
- >>> c = StringIO("0 1\\n2 3")
+ >>> c = StringIO(u"0 1\\n2 3")
>>> np.loadtxt(c)
- array([[ 0., 1.],
- [ 2., 3.]])
+ array([[0., 1.],
+ [2., 3.]])
- >>> d = StringIO("M 21 72\\nF 35 58")
+ >>> d = StringIO(u"M 21 72\\nF 35 58")
>>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
... 'formats': ('S1', 'i4', 'f4')})
- array([('M', 21, 72.0), ('F', 35, 58.0)],
- dtype=[('gender', '|S1'), ('age', '<i4'), ('weight', '<f4')])
+ array([(b'M', 21, 72.), (b'F', 35, 58.)],
+ dtype=[('gender', 'S1'), ('age', '<i4'), ('weight', '<f4')])
- >>> c = StringIO("1,0,2\\n3,0,4")
+ >>> c = StringIO(u"1,0,2\\n3,0,4")
>>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
>>> x
- array([ 1., 3.])
+ array([1., 3.])
>>> y
- array([ 2., 4.])
+ array([2., 4.])
"""
# Type conversions for Py3 convenience
if comments is not None:
if isinstance(comments, (basestring, bytes)):
- comments = [asbytes(comments)]
- else:
- comments = [asbytes(comment) for comment in comments]
-
+ comments = [comments]
+ comments = [_decode_line(x) for x in comments]
# Compile regex for comments beforehand
comments = (re.escape(comment) for comment in comments)
- regex_comments = re.compile(asbytes('|').join(comments))
- user_converters = converters
+ regex_comments = re.compile('|'.join(comments))
+
if delimiter is not None:
- delimiter = asbytes(delimiter)
+ delimiter = _decode_line(delimiter)
+
+ user_converters = converters
+
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+ else:
+ byte_converters = False
if usecols is not None:
# Allow usecols to be a single int or a sequence of ints
@@ -882,27 +955,31 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
fown = False
try:
- if is_pathlib_path(fname):
- fname = str(fname)
+ if isinstance(fname, os_PathLike):
+ fname = os_fspath(fname)
if _is_string_like(fname):
+ fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ fencoding = getattr(fh, 'encoding', 'latin1')
+ fh = iter(fh)
fown = True
- if fname.endswith('.gz'):
- import gzip
- fh = iter(gzip.GzipFile(fname))
- elif fname.endswith('.bz2'):
- import bz2
- fh = iter(bz2.BZ2File(fname))
- elif sys.version_info[0] == 2:
- fh = iter(open(fname, 'U'))
- else:
- fh = iter(open(fname))
else:
fh = iter(fname)
+ fencoding = getattr(fname, 'encoding', 'latin1')
except TypeError:
raise ValueError('fname must be a string, file handle, or generator')
- X = []
- def flatten_dtype(dt):
+ # input may be a python2 io stream
+ if encoding is not None:
+ fencoding = encoding
+ # we must assume local encoding
+ # TODO emit portability warning?
+ elif fencoding is None:
+ import locale
+ fencoding = locale.getpreferredencoding()
+
+ # not to be confused with the flatten_dtype we import...
+ @recursive
+ def flatten_dtype_internal(self, dt):
"""Unpack a structured data-type, and produce re-packing info."""
if dt.names is None:
# If the dtype is flattened, return.
@@ -922,7 +999,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
packing = []
for field in dt.names:
tp, bytes = dt.fields[field]
- flat_dt, flat_packing = flatten_dtype(tp)
+ flat_dt, flat_packing = self(tp)
types.extend(flat_dt)
# Avoid extra nesting for subarrays
if tp.ndim > 0:
@@ -931,7 +1008,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
packing.append((len(flat_dt), flat_packing))
return (types, packing)
- def pack_items(items, packing):
+ @recursive
+ def pack_items(self, items, packing):
"""Pack items into nested lists based on re-packing info."""
if packing is None:
return items[0]
@@ -943,26 +1021,60 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
start = 0
ret = []
for length, subpacking in packing:
- ret.append(pack_items(items[start:start+length], subpacking))
+ ret.append(self(items[start:start+length], subpacking))
start += length
return tuple(ret)
def split_line(line):
- """Chop off comments, strip, and split at delimiter.
-
- Note that although the file is opened as text, this function
- returns bytes.
+ """Chop off comments, strip, and split at delimiter. """
+ line = _decode_line(line, encoding=encoding)
- """
- line = asbytes(line)
if comments is not None:
- line = regex_comments.split(asbytes(line), maxsplit=1)[0]
- line = line.strip(asbytes('\r\n'))
+ line = regex_comments.split(line, maxsplit=1)[0]
+ line = line.strip('\r\n')
if line:
return line.split(delimiter)
else:
return []
+ def read_data(chunk_size):
+ """Parse each line, including the first.
+
+ The file being read, `fh`, is a global defined above.
+
+ Parameters
+ ----------
+ chunk_size : int
+ At most `chunk_size` lines are read at a time, with iteration
+ until all lines are read.
+
+ """
+ X = []
+ line_iter = itertools.chain([first_line], fh)
+ line_iter = itertools.islice(line_iter, max_rows)
+ for i, line in enumerate(line_iter):
+ vals = split_line(line)
+ if len(vals) == 0:
+ continue
+ if usecols:
+ vals = [vals[j] for j in usecols]
+ if len(vals) != N:
+ line_num = i + skiprows + 1
+ raise ValueError("Wrong number of columns at line %d"
+ % line_num)
+
+ # Convert each value according to its column and store
+ items = [conv(val) for (conv, val) in zip(converters, vals)]
+
+ # Then pack it according to the dtype's nesting
+ items = pack_items(items, packing)
+ X.append(items)
+ if len(X) > chunk_size:
+ yield X
+ X = []
+ if X:
+ yield X
+
try:
# Make sure we're dealing with a proper dtype
dtype = np.dtype(dtype)
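
``read_data`` yields the parsed rows in chunks so the caller can grow one array in place with ``ndarray.resize`` instead of materializing every row in a Python list. The growth step it relies on, in isolation:

>>> X = np.array([[0., 1.]])
>>> X.resize((2, 2), refcheck=False)  # grow in place; new rows are zero-filled
>>> X[1, :] = [2., 3.]
>>> X
array([[0., 1.],
       [2., 3.]])
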
@@ -986,7 +1098,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
warnings.warn('loadtxt: Empty input file: "%s"' % fname, stacklevel=2)
N = len(usecols or first_vals)
- dtype_types, packing = flatten_dtype(dtype)
+ dtype_types, packing = flatten_dtype_internal(dtype)
if len(dtype_types) > 1:
# We're dealing with a structured array, each field of
# the dtype matches a column
@@ -1005,30 +1117,41 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
except ValueError:
# Unused converter specified
continue
- converters[i] = conv
-
- # Parse each line, including the first
- for i, line in enumerate(itertools.chain([first_line], fh)):
- vals = split_line(line)
- if len(vals) == 0:
- continue
- if usecols:
- vals = [vals[i] for i in usecols]
- if len(vals) != N:
- line_num = i + skiprows + 1
- raise ValueError("Wrong number of columns at line %d"
- % line_num)
-
- # Convert each value according to its column and store
- items = [conv(val) for (conv, val) in zip(converters, vals)]
- # Then pack it according to the dtype's nesting
- items = pack_items(items, packing)
- X.append(items)
+ if byte_converters:
+ # converters may use decode to work around numpy's old behaviour,
+ # so encode the string again before passing to the user converter
+ def tobytes_first(x, conv):
+ if type(x) is bytes:
+ return conv(x)
+ return conv(x.encode("latin1"))
+ converters[i] = functools.partial(tobytes_first, conv=conv)
+ else:
+ converters[i] = conv
+
+ converters = [conv if conv is not bytes else
+ lambda x: x.encode(fencoding) for conv in converters]
+
+ # read data in chunks and fill it into an array via resize
+ # over-allocating and shrinking the array later may be faster but is
+ # probably not relevant compared to the cost of actually reading and
+ # converting the data
+ X = None
+ for x in read_data(_loadtxt_chunksize):
+ if X is None:
+ X = np.array(x, dtype)
+ else:
+ nshape = list(X.shape)
+ pos = nshape[0]
+ nshape[0] += len(x)
+ X.resize(nshape, refcheck=False)
+ X[pos:, ...] = x
finally:
if fown:
fh.close()
- X = np.array(X, dtype)
+ if X is None:
+ X = np.array([], dtype)
+
# Multicolumn data are returned with shape (1, N, M), i.e.
# (1, 1, M) for a single row - remove the singleton dimension there
if X.ndim == 3 and X.shape[:2] == (1, 1):
@@ -1059,8 +1182,15 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
return X
+def _savetxt_dispatcher(fname, X, fmt=None, delimiter=None, newline=None,
+ header=None, footer=None, comments=None,
+ encoding=None):
+ return (X,)
+
+
+@array_function_dispatch(_savetxt_dispatcher)
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
- footer='', comments='# '):
+ footer='', comments='# ', encoding=None):
"""
Save an array to a text file.
@@ -1070,20 +1200,21 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
If the filename ends in ``.gz``, the file is automatically saved in
compressed gzip format. `loadtxt` understands gzipped files
transparently.
- X : array_like
+ X : 1D or 2D array_like
Data to be saved to a text file.
fmt : str or sequence of strs, optional
A single format (%10.5f), a sequence of formats, or a
multi-format string, e.g. 'Iteration %d -- %10.5f', in which
case `delimiter` is ignored. For complex `X`, the legal options
for `fmt` are:
- a) a single specifier, `fmt='%.4e'`, resulting in numbers formatted
- like `' (%s+%sj)' % (fmt, fmt)`
- b) a full string specifying every real and imaginary part, e.g.
- `' %.4e %+.4ej %.4e %+.4ej %.4e %+.4ej'` for 3 columns
- c) a list of specifiers, one per column - in this case, the real
- and imaginary part must have separate specifiers,
- e.g. `['%.3e + %.3ej', '(%.15e%+.15ej)']` for 2 columns
+
+ * a single specifier, `fmt='%.4e'`, resulting in numbers formatted
+ like `' (%s+%sj)' % (fmt, fmt)`
+ * a full string specifying every real and imaginary part, e.g.
+ `' %.4e %+.4ej %.4e %+.4ej %.4e %+.4ej'` for 3 columns
+ * a list of specifiers, one per column - in this case, the real
+ and imaginary part must have separate specifiers,
+ e.g. `['%.3e + %.3ej', '(%.15e%+.15ej)']` for 2 columns
delimiter : str, optional
String or character separating columns.
newline : str, optional
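
For complex input, the single-specifier form listed above wraps each column as ``' (%s+%sj)' % (fmt, fmt)``, and the write loop below strips the ``'+-'`` produced by negative imaginary parts. A sketch of the resulting output, assuming the patch:

>>> import io
>>> buf = io.StringIO()
>>> np.savetxt(buf, np.array([1 + 2j, 3 - 4j]), fmt='%.1e')
>>> buf.getvalue().splitlines()
[' (1.0e+00+2.0e+00j)', ' (3.0e+00-4.0e+00j)']
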
@@ -1104,6 +1235,13 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
``numpy.loadtxt``.
.. versionadded:: 1.7.0
+ encoding : {None, str}, optional
+ Encoding used to encode the output file. Does not apply to output
+ streams. If the encoding is something other than 'bytes' or 'latin1'
+ you will not be able to load the file in NumPy versions < 1.14. Default
+ is 'latin1'.
+
+ .. versionadded:: 1.14.0
See Also
@@ -1161,8 +1299,8 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
References
----------
.. [1] `Format Specification Mini-Language
- <http://docs.python.org/library/string.html#
- format-specification-mini-language>`_, Python Documentation.
+ <https://docs.python.org/library/string.html#format-specification-mini-language>`_,
+ Python Documentation.
Examples
--------
@@ -1178,21 +1316,53 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
fmt = asstr(fmt)
delimiter = asstr(delimiter)
+ class WriteWrap(object):
+ """Convert to unicode in py2 or to bytes on bytestream inputs.
+
+ """
+ def __init__(self, fh, encoding):
+ self.fh = fh
+ self.encoding = encoding
+ self.do_write = self.first_write
+
+ def close(self):
+ self.fh.close()
+
+ def write(self, v):
+ self.do_write(v)
+
+ def write_bytes(self, v):
+ if isinstance(v, bytes):
+ self.fh.write(v)
+ else:
+ self.fh.write(v.encode(self.encoding))
+
+ def write_normal(self, v):
+ self.fh.write(asunicode(v))
+
+ def first_write(self, v):
+ try:
+ self.write_normal(v)
+ self.write = self.write_normal
+ except TypeError:
+ # input is probably a bytestream
+ self.write_bytes(v)
+ self.write = self.write_bytes
+
own_fh = False
- if is_pathlib_path(fname):
- fname = str(fname)
+ if isinstance(fname, os_PathLike):
+ fname = os_fspath(fname)
if _is_string_like(fname):
+ # datasource doesn't support creating a new file ...
+ open(fname, 'wt').close()
+ fh = np.lib._datasource.open(fname, 'wt', encoding=encoding)
own_fh = True
- if fname.endswith('.gz'):
- import gzip
- fh = gzip.open(fname, 'wb')
- else:
- if sys.version_info[0] >= 3:
- fh = open(fname, 'wb')
- else:
- fh = open(fname, 'w')
+ # need to convert str to unicode for text io output
+ if sys.version_info[0] == 2:
+ fh = WriteWrap(fh, encoding or 'latin1')
elif hasattr(fname, 'write'):
- fh = fname
+ # wrap to handle byte output streams
+ fh = WriteWrap(fname, encoding or 'latin1')
else:
raise ValueError('fname must be a string or file handle')
@@ -1200,7 +1370,10 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
X = np.asarray(X)
# Handle 1-dimensional arrays
- if X.ndim == 1:
+ if X.ndim == 0 or X.ndim > 2:
+ raise ValueError(
+ "Expected 1D or 2D array, got %dD array instead" % X.ndim)
+ elif X.ndim == 1:
# Common case -- 1d array of numbers
if X.dtype.names is None:
X = np.atleast_2d(X).T
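
With the dimensionality check above, scalar and >2-D input now fail fast instead of producing malformed text. A sketch, assuming the patch:

>>> import io
>>> np.savetxt(io.StringIO(), np.array(1.0))
Traceback (most recent call last):
    ...
ValueError: Expected 1D or 2D array, got 0D array instead
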
@@ -1208,7 +1381,7 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
# Complex dtype -- each field indicates a separate column
else:
- ncol = len(X.dtype.descr)
+ ncol = len(X.dtype.names)
else:
ncol = X.shape[1]
@@ -1239,31 +1412,35 @@ def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='',
if len(header) > 0:
header = header.replace('\n', '\n' + comments)
- fh.write(asbytes(comments + header + newline))
+ fh.write(comments + header + newline)
if iscomplex_X:
for row in X:
row2 = []
for number in row:
row2.append(number.real)
row2.append(number.imag)
- fh.write(asbytes(format % tuple(row2) + newline))
+ s = format % tuple(row2) + newline
+ fh.write(s.replace('+-', '-'))
else:
for row in X:
try:
- fh.write(asbytes(format % tuple(row) + newline))
+ v = format % tuple(row) + newline
except TypeError:
raise TypeError("Mismatch between array dtype ('%s') and "
"format specifier ('%s')"
% (str(X.dtype), format))
+ fh.write(v)
+
if len(footer) > 0:
footer = footer.replace('\n', '\n' + comments)
- fh.write(asbytes(comments + footer + newline))
+ fh.write(comments + footer + newline)
finally:
if own_fh:
fh.close()
-def fromregex(file, regexp, dtype):
+@set_module('numpy')
+def fromregex(file, regexp, dtype, encoding=None):
"""
Construct an array from a text file, using regular expression parsing.
@@ -1280,6 +1457,10 @@ def fromregex(file, regexp, dtype):
Groups in the regular expression correspond to fields in the dtype.
dtype : dtype or list of dtypes
Dtype for the structured array.
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply to input streams.
+
+ .. versionadded:: 1.14.0
Returns
-------
@@ -1305,31 +1486,37 @@ def fromregex(file, regexp, dtype):
Examples
--------
>>> f = open('test.dat', 'w')
- >>> f.write("1312 foo\\n1534 bar\\n444 qux")
+ >>> _ = f.write("1312 foo\\n1534 bar\\n444 qux")
>>> f.close()
>>> regexp = r"(\\d+)\\s+(...)" # match [digits, whitespace, anything]
>>> output = np.fromregex('test.dat', regexp,
... [('num', np.int64), ('key', 'S3')])
>>> output
- array([(1312L, 'foo'), (1534L, 'bar'), (444L, 'qux')],
- dtype=[('num', '<i8'), ('key', '|S3')])
+ array([(1312, b'foo'), (1534, b'bar'), ( 444, b'qux')],
+ dtype=[('num', '<i8'), ('key', 'S3')])
>>> output['num']
- array([1312, 1534, 444], dtype=int64)
+ array([1312, 1534, 444])
"""
own_fh = False
if not hasattr(file, "read"):
- file = open(file, 'rb')
+ file = np.lib._datasource.open(file, 'rt', encoding=encoding)
own_fh = True
try:
- if not hasattr(regexp, 'match'):
- regexp = re.compile(asbytes(regexp))
if not isinstance(dtype, np.dtype):
dtype = np.dtype(dtype)
- seq = regexp.findall(file.read())
+ content = file.read()
+ if isinstance(content, bytes) and isinstance(regexp, np.unicode):
+ regexp = asbytes(regexp)
+ elif isinstance(content, np.unicode) and isinstance(regexp, bytes):
+ regexp = asstr(regexp)
+
+ if not hasattr(regexp, 'match'):
+ regexp = re.compile(regexp)
+ seq = regexp.findall(content)
if seq and not isinstance(seq[0], tuple):
# Only one group is in the regexp.
# Create the new array as a single data-type and then
@@ -1351,13 +1538,14 @@ def fromregex(file, regexp, dtype):
#####--------------------------------------------------------------------------
+@set_module('numpy')
def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
skip_header=0, skip_footer=0, converters=None,
missing_values=None, filling_values=None, usecols=None,
names=None, excludelist=None, deletechars=None,
replace_space='_', autostrip=False, case_sensitive=True,
defaultfmt="f%i", unpack=None, usemask=False, loose=True,
- invalid_raise=True, max_rows=None):
+ invalid_raise=True, max_rows=None, encoding='bytes'):
"""
Load data from a text file, with missing values handled as specified.
@@ -1403,11 +1591,12 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
Which columns to read, with 0 being the first. For example,
``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns.
names : {None, True, str, sequence}, optional
- If `names` is True, the field names are read from the first valid line
- after the first `skip_header` lines.
- If `names` is a sequence or a single-string of comma-separated names,
- the names will be used to define the field names in a structured dtype.
- If `names` is None, the names of the dtype fields will be used, if any.
+ If `names` is True, the field names are read from the first line after
+ the first `skip_header` lines. This line can optionally be preceded
+ by a comment delimiter. If `names` is a sequence or a single-string of
+ comma-separated names, the names will be used to define the field names
+ in a structured dtype. If `names` is None, the names of the dtype
+ fields will be used, if any.
excludelist : sequence, optional
A list of names to exclude. This list is appended to the default list
['return','file','print']. Excluded names are appended an underscore:
@@ -1444,6 +1633,15 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
to read the entire file.
.. versionadded:: 1.10.0
+ encoding : str, optional
+ Encoding used to decode the input file. Does not apply when `fname` is
+ a file object. The special value 'bytes' enables backward compatibility
+ workarounds that ensure that you receive byte arrays when possible
+ and pass latin1 encoded strings to converters. Override this value to
+ receive unicode arrays and pass strings as input to converters. If set
+ to None the system default is used. The default value is 'bytes'.
+
+ .. versionadded:: 1.14.0
Returns
-------
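
Passing a real codec (or ``None``) instead of the default ``'bytes'`` makes ``genfromtxt`` hand unicode strings to converters and produce unicode string columns. A sketch of the expected dtype, assuming the patch:

>>> from io import StringIO
>>> s = StringIO(u"1,abc")
>>> np.genfromtxt(s, dtype=None, delimiter=',', encoding='utf-8')
array((1, 'abc'), dtype=[('f0', '<i8'), ('f1', '<U3')])
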
@@ -1468,7 +1666,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
References
----------
.. [1] NumPy User Guide, section `I/O with NumPy
- <http://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_.
+ <https://docs.scipy.org/doc/numpy/user/basics.io.genfromtxt.html>`_.
Examples
---------
@@ -1477,39 +1675,39 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
Comma delimited file with mixed dtype
- >>> s = StringIO("1,1.3,abcde")
+ >>> s = StringIO(u"1,1.3,abcde")
>>> data = np.genfromtxt(s, dtype=[('myint','i8'),('myfloat','f8'),
... ('mystring','S5')], delimiter=",")
>>> data
- array((1, 1.3, 'abcde'),
- dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])
+ array((1, 1.3, b'abcde'),
+ dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
Using dtype = None
- >>> s.seek(0) # needed for StringIO example only
+ >>> _ = s.seek(0) # needed for StringIO example only
>>> data = np.genfromtxt(s, dtype=None,
... names = ['myint','myfloat','mystring'], delimiter=",")
>>> data
- array((1, 1.3, 'abcde'),
- dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])
+ array((1, 1.3, b'abcde'),
+ dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
Specifying dtype and names
- >>> s.seek(0)
+ >>> _ = s.seek(0)
>>> data = np.genfromtxt(s, dtype="i8,f8,S5",
... names=['myint','myfloat','mystring'], delimiter=",")
>>> data
- array((1, 1.3, 'abcde'),
- dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', '|S5')])
+ array((1, 1.3, b'abcde'),
+ dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])
An example with fixed-width columns
- >>> s = StringIO("11.3abcde")
+ >>> s = StringIO(u"11.3abcde")
>>> data = np.genfromtxt(s, dtype=None, names=['intvar','fltvar','strvar'],
... delimiter=[1,3,5])
>>> data
- array((1, 1.3, 'abcde'),
- dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', '|S5')])
+ array((1, 1.3, b'abcde'),
+ dtype=[('intvar', '<i8'), ('fltvar', '<f8'), ('strvar', 'S5')])
"""
if max_rows is not None:
@@ -1520,15 +1718,6 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
if max_rows < 1:
raise ValueError("'max_rows' must be at least 1.")
- # Py3 data conversions to bytes, for convenience
- if comments is not None:
- comments = asbytes(comments)
- if isinstance(delimiter, unicode):
- delimiter = asbytes(delimiter)
- if isinstance(missing_values, (unicode, list, tuple)):
- missing_values = asbytes_nested(missing_values)
-
- #
if usemask:
from numpy.ma import MaskedArray, make_mask_descr
# Check the input dictionary of converters
@@ -1538,290 +1727,306 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
"The input argument 'converter' should be a valid dictionary "
"(got '%s' instead)" % type(user_converters))
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+ else:
+ byte_converters = False
+
# Initialize the filehandle, the LineSplitter and the NameValidator
- own_fhd = False
try:
- if is_pathlib_path(fname):
- fname = str(fname)
+ if isinstance(fname, os_PathLike):
+ fname = os_fspath(fname)
if isinstance(fname, basestring):
- if sys.version_info[0] == 2:
- fhd = iter(np.lib._datasource.open(fname, 'rbU'))
- else:
- fhd = iter(np.lib._datasource.open(fname, 'rb'))
- own_fhd = True
+ fid = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ fid_ctx = contextlib.closing(fid)
else:
- fhd = iter(fname)
+ fid = fname
+ fid_ctx = contextlib_nullcontext(fid)
+ fhd = iter(fid)
except TypeError:
raise TypeError(
"fname must be a string, filehandle, list of strings, "
"or generator. Got %s instead." % type(fname))
- split_line = LineSplitter(delimiter=delimiter, comments=comments,
- autostrip=autostrip)._handyman
- validate_names = NameValidator(excludelist=excludelist,
- deletechars=deletechars,
- case_sensitive=case_sensitive,
- replace_space=replace_space)
+ with fid_ctx:
+ split_line = LineSplitter(delimiter=delimiter, comments=comments,
+ autostrip=autostrip, encoding=encoding)
+ validate_names = NameValidator(excludelist=excludelist,
+ deletechars=deletechars,
+ case_sensitive=case_sensitive,
+ replace_space=replace_space)
- # Skip the first `skip_header` rows
- for i in range(skip_header):
- next(fhd)
+ # Skip the first `skip_header` rows
+ for i in range(skip_header):
+ next(fhd)
- # Keep on until we find the first valid values
- first_values = None
- try:
- while not first_values:
- first_line = next(fhd)
- if names is True:
- if comments in first_line:
- first_line = (
- asbytes('').join(first_line.split(comments)[1:]))
- first_values = split_line(first_line)
- except StopIteration:
- # return an empty array if the datafile is empty
- first_line = asbytes('')
- first_values = []
- warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2)
-
- # Should we take the first values as names ?
- if names is True:
- fval = first_values[0].strip()
- if fval in comments:
- del first_values[0]
-
- # Check the columns to use: make sure `usecols` is a list
- if usecols is not None:
+ # Keep on until we find the first valid values
+ first_values = None
try:
- usecols = [_.strip() for _ in usecols.split(",")]
- except AttributeError:
+ while not first_values:
+ first_line = _decode_line(next(fhd), encoding)
+ if (names is True) and (comments is not None):
+ if comments in first_line:
+ first_line = (
+ ''.join(first_line.split(comments)[1:]))
+ first_values = split_line(first_line)
+ except StopIteration:
+ # return an empty array if the datafile is empty
+ first_line = ''
+ first_values = []
+ warnings.warn('genfromtxt: Empty input file: "%s"' % fname, stacklevel=2)
+
+ # Should we take the first values as names ?
+ if names is True:
+ fval = first_values[0].strip()
+ if comments is not None:
+ if fval in comments:
+ del first_values[0]
+
+ # Check the columns to use: make sure `usecols` is a list
+ if usecols is not None:
try:
- usecols = list(usecols)
- except TypeError:
- usecols = [usecols, ]
- nbcols = len(usecols or first_values)
-
- # Check the names and overwrite the dtype.names if needed
- if names is True:
- names = validate_names([_bytes_to_name(_.strip())
- for _ in first_values])
- first_line = asbytes('')
- elif _is_string_like(names):
- names = validate_names([_.strip() for _ in names.split(',')])
- elif names:
- names = validate_names(names)
- # Get the dtype
- if dtype is not None:
- dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names,
- excludelist=excludelist,
- deletechars=deletechars,
- case_sensitive=case_sensitive,
- replace_space=replace_space)
- # Make sure the names is a list (for 2.5)
- if names is not None:
- names = list(names)
-
- if usecols:
- for (i, current) in enumerate(usecols):
- # if usecols is a list of names, convert to a list of indices
- if _is_string_like(current):
- usecols[i] = names.index(current)
- elif current < 0:
- usecols[i] = current + len(first_values)
- # If the dtype is not None, make sure we update it
- if (dtype is not None) and (len(dtype) > nbcols):
- descr = dtype.descr
- dtype = np.dtype([descr[_] for _ in usecols])
- names = list(dtype.names)
- # If `names` is not None, update the names
- elif (names is not None) and (len(names) > nbcols):
- names = [names[_] for _ in usecols]
- elif (names is not None) and (dtype is not None):
- names = list(dtype.names)
-
- # Process the missing values ...............................
- # Rename missing_values for convenience
- user_missing_values = missing_values or ()
-
- # Define the list of missing_values (one column: one list)
- missing_values = [list([asbytes('')]) for _ in range(nbcols)]
-
- # We have a dictionary: process it field by field
- if isinstance(user_missing_values, dict):
- # Loop on the items
- for (key, val) in user_missing_values.items():
- # Is the key a string ?
- if _is_string_like(key):
+ usecols = [_.strip() for _ in usecols.split(",")]
+ except AttributeError:
try:
- # Transform it into an integer
- key = names.index(key)
- except ValueError:
- # We couldn't find it: the name must have been dropped
- continue
- # Redefine the key as needed if it's a column number
- if usecols:
- try:
- key = usecols.index(key)
- except ValueError:
- pass
- # Transform the value as a list of string
- if isinstance(val, (list, tuple)):
- val = [str(_) for _ in val]
+ usecols = list(usecols)
+ except TypeError:
+ usecols = [usecols, ]
+ nbcols = len(usecols or first_values)
+
+ # Check the names and overwrite the dtype.names if needed
+ if names is True:
+ names = validate_names([str(_.strip()) for _ in first_values])
+ first_line = ''
+ elif _is_string_like(names):
+ names = validate_names([_.strip() for _ in names.split(',')])
+ elif names:
+ names = validate_names(names)
+ # Get the dtype
+ if dtype is not None:
+ dtype = easy_dtype(dtype, defaultfmt=defaultfmt, names=names,
+ excludelist=excludelist,
+ deletechars=deletechars,
+ case_sensitive=case_sensitive,
+ replace_space=replace_space)
+ # Make sure the names is a list (for 2.5)
+ if names is not None:
+ names = list(names)
+
+ if usecols:
+ for (i, current) in enumerate(usecols):
+ # if usecols is a list of names, convert to a list of indices
+ if _is_string_like(current):
+ usecols[i] = names.index(current)
+ elif current < 0:
+ usecols[i] = current + len(first_values)
+ # If the dtype is not None, make sure we update it
+ if (dtype is not None) and (len(dtype) > nbcols):
+ descr = dtype.descr
+ dtype = np.dtype([descr[_] for _ in usecols])
+ names = list(dtype.names)
+ # If `names` is not None, update the names
+ elif (names is not None) and (len(names) > nbcols):
+ names = [names[_] for _ in usecols]
+ elif (names is not None) and (dtype is not None):
+ names = list(dtype.names)
+
+ # Process the missing values ...............................
+ # Rename missing_values for convenience
+ user_missing_values = missing_values or ()
+ if isinstance(user_missing_values, bytes):
+ user_missing_values = user_missing_values.decode('latin1')
+
+ # Define the list of missing_values (one column: one list)
+ missing_values = [list(['']) for _ in range(nbcols)]
+
+ # We have a dictionary: process it field by field
+ if isinstance(user_missing_values, dict):
+ # Loop on the items
+ for (key, val) in user_missing_values.items():
+ # Is the key a string ?
+ if _is_string_like(key):
+ try:
+ # Transform it into an integer
+ key = names.index(key)
+ except ValueError:
+ # We couldn't find it: the name must have been dropped
+ continue
+ # Redefine the key as needed if it's a column number
+ if usecols:
+ try:
+ key = usecols.index(key)
+ except ValueError:
+ pass
+ # Transform the value as a list of string
+ if isinstance(val, (list, tuple)):
+ val = [str(_) for _ in val]
+ else:
+ val = [str(val), ]
+ # Add the value(s) to the current list of missing
+ if key is None:
+ # None acts as default
+ for miss in missing_values:
+ miss.extend(val)
+ else:
+ missing_values[key].extend(val)
+ # We have a sequence : each item matches a column
+ elif isinstance(user_missing_values, (list, tuple)):
+ for (value, entry) in zip(user_missing_values, missing_values):
+ value = str(value)
+ if value not in entry:
+ entry.append(value)
+ # We have a string : apply it to all entries
+ elif isinstance(user_missing_values, basestring):
+ user_value = user_missing_values.split(",")
+ for entry in missing_values:
+ entry.extend(user_value)
+ # We have something else: apply it to all entries
+ else:
+ for entry in missing_values:
+ entry.extend([str(user_missing_values)])
+
+ # Process the filling_values ...............................
+ # Rename the input for convenience
+ user_filling_values = filling_values
+ if user_filling_values is None:
+ user_filling_values = []
+ # Define the default
+ filling_values = [None] * nbcols
+ # We have a dictionary : update each entry individually
+ if isinstance(user_filling_values, dict):
+ for (key, val) in user_filling_values.items():
+ if _is_string_like(key):
+ try:
+ # Transform it into an integer
+ key = names.index(key)
+ except ValueError:
+ # We couldn't find it: the name must have been dropped,
+ continue
+ # Redefine the key if it's a column number and usecols is defined
+ if usecols:
+ try:
+ key = usecols.index(key)
+ except ValueError:
+ pass
+ # Add the value to the list
+ filling_values[key] = val
+ # We have a sequence : update on a one-to-one basis
+ elif isinstance(user_filling_values, (list, tuple)):
+ n = len(user_filling_values)
+ if (n <= nbcols):
+ filling_values[:n] = user_filling_values
else:
- val = [str(val), ]
- # Add the value(s) to the current list of missing
- if key is None:
- # None acts as default
- for miss in missing_values:
- miss.extend(val)
+ filling_values = user_filling_values[:nbcols]
+ # We have something else : use it for all entries
+ else:
+ filling_values = [user_filling_values] * nbcols
+
+ # Initialize the converters ................................
+ if dtype is None:
+ # Note: we can't use a [...]*nbcols, as we would have 3 times the same
+ # ... converter, instead of 3 different converters.
+ converters = [StringConverter(None, missing_values=miss, default=fill)
+ for (miss, fill) in zip(missing_values, filling_values)]
+ else:
+ dtype_flat = flatten_dtype(dtype, flatten_base=True)
+ # Initialize the converters
+ if len(dtype_flat) > 1:
+ # Flexible type : get a converter from each dtype
+ zipit = zip(dtype_flat, missing_values, filling_values)
+ converters = [StringConverter(dt, locked=True,
+ missing_values=miss, default=fill)
+ for (dt, miss, fill) in zipit]
else:
- missing_values[key].extend(val)
- # We have a sequence : each item matches a column
- elif isinstance(user_missing_values, (list, tuple)):
- for (value, entry) in zip(user_missing_values, missing_values):
- value = str(value)
- if value not in entry:
- entry.append(value)
- # We have a string : apply it to all entries
- elif isinstance(user_missing_values, bytes):
- user_value = user_missing_values.split(asbytes(","))
- for entry in missing_values:
- entry.extend(user_value)
- # We have something else: apply it to all entries
- else:
- for entry in missing_values:
- entry.extend([str(user_missing_values)])
-
- # Process the filling_values ...............................
- # Rename the input for convenience
- user_filling_values = filling_values
- if user_filling_values is None:
- user_filling_values = []
- # Define the default
- filling_values = [None] * nbcols
- # We have a dictionary : update each entry individually
- if isinstance(user_filling_values, dict):
- for (key, val) in user_filling_values.items():
- if _is_string_like(key):
+ # Set to a default converter (but w/ different missing values)
+ zipit = zip(missing_values, filling_values)
+ converters = [StringConverter(dtype, locked=True,
+ missing_values=miss, default=fill)
+ for (miss, fill) in zipit]
+ # Update the converters to use the user-defined ones
+ uc_update = []
+ for (j, conv) in user_converters.items():
+ # If the converter is specified by column names, use the index instead
+ if _is_string_like(j):
try:
- # Transform it into an integer
- key = names.index(key)
+ j = names.index(j)
+ i = j
except ValueError:
- # We couldn't find it: the name must have been dropped,
continue
- # Redefine the key if it's a column number and usecols is defined
- if usecols:
+ elif usecols:
try:
- key = usecols.index(key)
+ i = usecols.index(j)
except ValueError:
- pass
- # Add the value to the list
- filling_values[key] = val
- # We have a sequence : update on a one-to-one basis
- elif isinstance(user_filling_values, (list, tuple)):
- n = len(user_filling_values)
- if (n <= nbcols):
- filling_values[:n] = user_filling_values
- else:
- filling_values = user_filling_values[:nbcols]
- # We have something else : use it for all entries
- else:
- filling_values = [user_filling_values] * nbcols
-
- # Initialize the converters ................................
- if dtype is None:
- # Note: we can't use a [...]*nbcols, as we would have 3 times the same
- # ... converter, instead of 3 different converters.
- converters = [StringConverter(None, missing_values=miss, default=fill)
- for (miss, fill) in zip(missing_values, filling_values)]
- else:
- dtype_flat = flatten_dtype(dtype, flatten_base=True)
- # Initialize the converters
- if len(dtype_flat) > 1:
- # Flexible type : get a converter from each dtype
- zipit = zip(dtype_flat, missing_values, filling_values)
- converters = [StringConverter(dt, locked=True,
- missing_values=miss, default=fill)
- for (dt, miss, fill) in zipit]
- else:
- # Set to a default converter (but w/ different missing values)
- zipit = zip(missing_values, filling_values)
- converters = [StringConverter(dtype, locked=True,
- missing_values=miss, default=fill)
- for (miss, fill) in zipit]
- # Update the converters to use the user-defined ones
- uc_update = []
- for (j, conv) in user_converters.items():
- # If the converter is specified by column names, use the index instead
- if _is_string_like(j):
- try:
- j = names.index(j)
+ # Unused converter specified
+ continue
+ else:
i = j
- except ValueError:
- continue
- elif usecols:
- try:
- i = usecols.index(j)
- except ValueError:
- # Unused converter specified
+ # Find the value to test - first_line is not filtered by usecols:
+ if len(first_line):
+ testing_value = first_values[j]
+ else:
+ testing_value = None
+ if conv is bytes:
+ user_conv = asbytes
+ elif byte_converters:
+ # converters may use decode to work around numpy's old behaviour,
+ # so encode the string again before passing to the user converter
+ def tobytes_first(x, conv):
+ if type(x) is bytes:
+ return conv(x)
+ return conv(x.encode("latin1"))
+ user_conv = functools.partial(tobytes_first, conv=conv)
+ else:
+ user_conv = conv
+ converters[i].update(user_conv, locked=True,
+ testing_value=testing_value,
+ default=filling_values[i],
+ missing_values=missing_values[i],)
+ uc_update.append((i, user_conv))
+ # Make sure we have the corrected keys in user_converters...
+ user_converters.update(uc_update)
+
+ # Fixme: possible error as following variable never used.
+ # miss_chars = [_.missing_values for _ in converters]
+
+ # Initialize the output lists ...
+ # ... rows
+ rows = []
+ append_to_rows = rows.append
+ # ... masks
+ if usemask:
+ masks = []
+ append_to_masks = masks.append
+ # ... invalid
+ invalid = []
+ append_to_invalid = invalid.append
+
+ # Parse each line
+ for (i, line) in enumerate(itertools.chain([first_line, ], fhd)):
+ values = split_line(line)
+ nbvalues = len(values)
+ # Skip an empty line
+ if nbvalues == 0:
continue
- else:
- i = j
- # Find the value to test - first_line is not filtered by usecols:
- if len(first_line):
- testing_value = first_values[j]
- else:
- testing_value = None
- converters[i].update(conv, locked=True,
- testing_value=testing_value,
- default=filling_values[i],
- missing_values=missing_values[i],)
- uc_update.append((i, conv))
- # Make sure we have the corrected keys in user_converters...
- user_converters.update(uc_update)
-
- # Fixme: possible error as following variable never used.
- #miss_chars = [_.missing_values for _ in converters]
-
- # Initialize the output lists ...
- # ... rows
- rows = []
- append_to_rows = rows.append
- # ... masks
- if usemask:
- masks = []
- append_to_masks = masks.append
- # ... invalid
- invalid = []
- append_to_invalid = invalid.append
-
- # Parse each line
- for (i, line) in enumerate(itertools.chain([first_line, ], fhd)):
- values = split_line(line)
- nbvalues = len(values)
- # Skip an empty line
- if nbvalues == 0:
- continue
- if usecols:
- # Select only the columns we need
- try:
- values = [values[_] for _ in usecols]
- except IndexError:
+ if usecols:
+ # Select only the columns we need
+ try:
+ values = [values[_] for _ in usecols]
+ except IndexError:
+ append_to_invalid((i + skip_header + 1, nbvalues))
+ continue
+ elif nbvalues != nbcols:
append_to_invalid((i + skip_header + 1, nbvalues))
continue
- elif nbvalues != nbcols:
- append_to_invalid((i + skip_header + 1, nbvalues))
- continue
- # Store the values
- append_to_rows(tuple(values))
- if usemask:
- append_to_masks(tuple([v.strip() in m
- for (v, m) in zip(values,
- missing_values)]))
- if len(rows) == max_rows:
- break
-
- if own_fhd:
- fhd.close()
+ # Store the values
+ append_to_rows(tuple(values))
+ if usemask:
+ append_to_masks(tuple([v.strip() in m
+ for (v, m) in zip(values,
+ missing_values)]))
+ if len(rows) == max_rows:
+ break
# Upgrade the converters (if needed)
if dtype is None:
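
The ``missing_values`` and ``filling_values`` normalized above feed the per-column ``StringConverter`` objects, so a sentinel token is replaced during parsing. A sketch, assuming the patch:

>>> from io import StringIO
>>> s = StringIO(u"1,N/A\n3,4")
>>> np.genfromtxt(s, delimiter=',', missing_values='N/A', filling_values=-1)
array([[ 1., -1.],
       [ 3.,  4.]])
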
@@ -1892,25 +2097,54 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
column_types = [conv.type for conv in converters]
# Find the columns with strings...
strcolidx = [i for (i, v) in enumerate(column_types)
- if v in (type('S'), np.string_)]
- # ... and take the largest number of chars.
- for i in strcolidx:
- column_types[i] = "|S%i" % max(len(row[i]) for row in data)
- #
+ if v == np.unicode_]
+
+ if byte_converters and strcolidx:
+ # convert strings back to bytes for backward compatibility
+ warnings.warn(
+ "Reading unicode strings without specifying the encoding "
+ "argument is deprecated. Set the encoding, use None for the "
+ "system default.",
+ np.VisibleDeprecationWarning, stacklevel=2)
+ def encode_unicode_cols(row_tup):
+ row = list(row_tup)
+ for i in strcolidx:
+ row[i] = row[i].encode('latin1')
+ return tuple(row)
+
+ try:
+ data = [encode_unicode_cols(r) for r in data]
+ except UnicodeEncodeError:
+ pass
+ else:
+ for i in strcolidx:
+ column_types[i] = np.bytes_
+
+ # Update string types to be the right length
+ sized_column_types = column_types[:]
+ for i, col_type in enumerate(column_types):
+ if np.issubdtype(col_type, np.character):
+ n_chars = max(len(row[i]) for row in data)
+ sized_column_types[i] = (col_type, n_chars)
+
if names is None:
- # If the dtype is uniform, don't define names, else use ''
- base = set([c.type for c in converters if c._checked])
+ # If the dtype is uniform (before sizing strings)
+ base = {
+ c_type
+ for c, c_type in zip(converters, column_types)
+ if c._checked}
if len(base) == 1:
- (ddtype, mdtype) = (list(base)[0], np.bool)
+ uniform_type, = base
+ (ddtype, mdtype) = (uniform_type, bool)
else:
ddtype = [(defaultfmt % i, dt)
- for (i, dt) in enumerate(column_types)]
+ for (i, dt) in enumerate(sized_column_types)]
if usemask:
- mdtype = [(defaultfmt % i, np.bool)
- for (i, dt) in enumerate(column_types)]
+ mdtype = [(defaultfmt % i, bool)
+ for (i, dt) in enumerate(sized_column_types)]
else:
- ddtype = list(zip(names, column_types))
- mdtype = list(zip(names, [np.bool] * len(column_types)))
+ ddtype = list(zip(names, sized_column_types))
+ mdtype = list(zip(names, [bool] * len(sized_column_types)))
output = np.array(data, dtype=ddtype)
if usemask:
outputmask = np.array(masks, dtype=mdtype)
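
The ``(col_type, n_chars)`` tuples built above lean on NumPy's flexible-dtype shorthand, where a (type, length) pair sizes a string column. In isolation:

>>> np.dtype((np.bytes_, 3))
dtype('S3')
>>> np.dtype((np.unicode_, 3))
dtype('<U3')
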
@@ -1936,7 +2170,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Now, process the rowmasks the same way
if usemask:
rowmasks = np.array(
- masks, dtype=np.dtype([('', np.bool) for t in dtype_flat]))
+ masks, dtype=np.dtype([('', bool) for t in dtype_flat]))
# Construct the new dtype
mdtype = make_mask_descr(dtype)
outputmask = rowmasks.view(mdtype)
@@ -1950,8 +2184,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
# Keep the dtype of the current converter
if i in user_converters:
ishomogeneous &= (ttype == dtype.type)
- if ttype == np.string_:
- ttype = "|S%i" % max(len(row[i]) for row in data)
+ if np.issubdtype(ttype, np.character):
+ ttype = (ttype, max(len(row[i]) for row in data))
descr.append(('', ttype))
else:
descr.append(('', dtype))
@@ -1967,16 +2201,16 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None,
output = np.array(data, dtype)
if usemask:
if dtype.names:
- mdtype = [(_, np.bool) for _ in dtype.names]
+ mdtype = [(_, bool) for _ in dtype.names]
else:
- mdtype = np.bool
+ mdtype = bool
outputmask = np.array(masks, dtype=mdtype)
# Try to take care of the missing data we missed
names = output.dtype.names
if usemask and names:
- for (name, conv) in zip(names or (), converters):
+ for (name, conv) in zip(names, converters):
missing_values = [conv(_) for _ in conv.missing_values
- if _ != asbytes('')]
+ if _ != '']
for mval in missing_values:
outputmask[name] |= (output[name] == mval)
# Construct the final array