author    Sebastian Berg <sebastian@sipsolutions.net>    2021-10-20 10:17:39 -0500
committer Sebastian Berg <sebastian@sipsolutions.net>    2022-01-14 20:04:29 -0600
commit    1e15b43ae15af14127f62f05c84d69393881774a (patch)
tree      ea8f47072497ec085b4c045c88feaa4925668b83 /numpy/lib/npyio.py
parent    e2d35064df262efa6eb7dfe5bfc43160c73cf685 (diff)
download  numpy-1e15b43ae15af14127f62f05c84d69393881774a.tar.gz
ENH: Move npreadtext into NumPy for faster text reading
This replaces `np.loadtxt` with the new textreader. The file has a few
minor cleanups compared to the npreadtext version.

npreadtext was started by Warren Weckesser for inclusion in NumPy and was
then very heavily modified by me (Sebastian Berg) to improve it and slim
it down slightly.

Some parts of this code are inspired by, or even taken from, the pandas
parser (mainly the integer parsers are still fairly verbatim).

Co-authored-by: Warren Weckesser <warren.weckesser@gmail.com>
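(Editor's note, not part of the commit message: the user-facing API is
unchanged; a call such as the one below now runs through the new C-level
reader. A minimal sketch, assuming a small CSV file on disk.)

>>> import numpy as np
>>> with open('points.csv', 'w') as f:
...     _ = f.write('1.0,2.0\n3.0,4.0\n')
>>> np.loadtxt('points.csv', delimiter=',')
array([[1., 2.],
       [3., 4.]])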
Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--  numpy/lib/npyio.py  634
1 file changed, 323 insertions(+), 311 deletions(-)
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index a6c2d4c2d..c2472f601 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -5,6 +5,7 @@ import itertools
import warnings
import weakref
import contextlib
+import operator
from operator import itemgetter, index as opindex, methodcaller
from collections.abc import Mapping
@@ -13,6 +14,7 @@ from . import format
from ._datasource import DataSource
from numpy.core import overrides
from numpy.core.multiarray import packbits, unpackbits
+from numpy.core._multiarray_umath import _load_from_filelike
from numpy.core.overrides import set_array_function_like_doc, set_module
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
@@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
zipf.close()
-def _floatconv(x):
- try:
- return float(x) # The fastest path.
- except ValueError:
- if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10.
- try:
- return float.fromhex(x)
- except ValueError:
- pass
- raise # Raise the original exception, which makes more sense.
-
-
-_CONVERTERS = [ # These converters only ever get strs (not bytes) as input.
- (np.bool_, lambda x: bool(int(x))),
- (np.uint64, np.uint64),
- (np.int64, np.int64),
- (np.integer, lambda x: int(float(x))),
- (np.longdouble, np.longdouble),
- (np.floating, _floatconv),
- (complex, lambda x: complex(x.replace('+-', '-'))),
- (np.bytes_, methodcaller('encode', 'latin-1')),
- (np.unicode_, str),
-]
-
-
-def _getconv(dtype):
- """
- Find the correct dtype converter. Adapted from matplotlib.
-
- Even when a lambda is returned, it is defined at the toplevel, to allow
- testing for equality and enabling optimization for single-type data.
- """
- for base, conv in _CONVERTERS:
- if issubclass(dtype.type, base):
- return conv
- return str
-
-
-# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers
-# lifted to the toplevel because recursive inner functions cause either
-# GC-dependent reference loops (because they are closures over loadtxt's
-# internal variables) or large overheads if using a manual trampoline to hide
-# the recursive calls.
-
-
-# not to be confused with the flatten_dtype we import...
-def _loadtxt_flatten_dtype_internal(dt):
- """Unpack a structured data-type, and produce a packer function."""
- if dt.names is None:
- # If the dtype is flattened, return.
- # If the dtype has a shape, the dtype occurs
- # in the list more than once.
- shape = dt.shape
- if len(shape) == 0:
- return ([dt.base], None)
- else:
- packing = [(shape[-1], list)]
- if len(shape) > 1:
- for dim in dt.shape[-2::-1]:
- packing = [(dim*packing[0][0], packing*dim)]
- return ([dt.base] * int(np.prod(dt.shape)),
- functools.partial(_loadtxt_pack_items, packing))
- else:
- types = []
- packing = []
- for field in dt.names:
- tp, bytes = dt.fields[field]
- flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
- types.extend(flat_dt)
- flat_packing = flat_packer.args[0] if flat_packer else None
- # Avoid extra nesting for subarrays
- if tp.ndim > 0:
- packing.extend(flat_packing)
- else:
- packing.append((len(flat_dt), flat_packing))
- return (types, functools.partial(_loadtxt_pack_items, packing))
-
-
-def _loadtxt_pack_items(packing, items):
- """Pack items into nested lists based on re-packing info."""
- if packing is None:
- return items[0]
- elif packing is tuple:
- return tuple(items)
- elif packing is list:
- return list(items)
- else:
- start = 0
- ret = []
- for length, subpacking in packing:
- ret.append(
- _loadtxt_pack_items(subpacking, items[start:start+length]))
- start += length
- return tuple(ret)
-
def _ensure_ndmin_ndarray_check_param(ndmin):
"""Just checks if the param ndmin is supported on
_ensure_ndmin_ndarray. Is intented to be used as
@@ -859,6 +766,310 @@ def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
return (like,)
+def _check_nonneg_int(value, name="argument"):
+ try:
+ operator.index(value)
+ except TypeError:
+ raise TypeError(f"{name} must be an integer") from None
+ if value < 0:
+ raise ValueError(f"{name} must be nonnegative")
+
+
+def _preprocess_comments(iterable, comments, encoding):
+ """
+ Generator that consumes an iterable of lines and strips the
+ multiple (or multi-character) comments from those lines.
+ This is a pre-processing step to achieve feature parity with loadtxt
+ (we assume that this is a niche feature).
+ """
+ for line in iterable:
+ if isinstance(line, bytes):
+ # Need to handle conversion here, or the splitting would fail
+ line = line.decode(encoding)
+
+ for c in comments:
+ line = line.split(c, 1)[0]
+
+ yield line
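(Editor's illustration, not part of the patch: each comment marker is
stripped in turn, so every line is truncated at the first occurrence of
any of the markers.)

>>> lines = ['1 2 # trailing\n', '3 4 // other style\n', '5 6\n']
>>> list(_preprocess_comments(iter(lines), ('#', '//'), None))
['1 2 ', '3 4 ', '5 6\n']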
+
+
+# The number of rows we read in one go if confronted with a parametric dtype
+_loadtxt_chunksize = 50000
+
+
+def _read(fname, *, delimiter=',', comment='#', quote='"',
+ imaginary_unit='j', usecols=None, skiprows=0,
+ max_rows=None, converters=None, ndmin=None, unpack=False,
+ dtype=np.float64, encoding="bytes"):
+ r"""
+ Read a NumPy array from a text file.
+
+ Parameters
+ ----------
+ fname : str or file object
+ The filename or the file to be read.
+ delimiter : str, optional
+ Field delimiter separating the fields in each line of the file.
+ Default is a comma, ','.
+ comment : str or sequence of str, optional
+ Character that begins a comment. All text from the comment
+ character to the end of the line is ignored.
+ Multiple comments or multiple-character comment strings are supported,
+ but may be slower and `quote` must be empty if used.
+ quote : str, optional
+ Character that is used to quote string fields. Default is '"'
+ (a double quote).
+ imaginary_unit : str, optional
+ Character that represents the imaginary unit `sqrt(-1)`.
+ Default is 'j'.
+ usecols : array_like, optional
+ A one-dimensional array of integer column numbers. These are the
+ columns from the file to be included in the array. If this value
+ is not given, all the columns are used.
+ skiprows : int, optional
+ Number of lines to skip before interpreting the data in the file.
+ max_rows : int, optional
+ Maximum number of rows of data to read. Default is to read the
+ entire file.
+ converters : dict, optional
+ A dictionary mapping column number to a function that will parse the
+ column string into the desired value. E.g. if column 0 is a date
+ string: ``converters = {0: datestr2num}``. Converters can also be used
+ to provide a default value for missing data, e.g.
+ ``converters = {3: lambda s: float(s.strip() or 0)}``.
+ Default: None
+ ndmin : int, optional
+ Minimum dimension of the array returned.
+ Allowed values are 0, 1 or 2. Default is 0.
+ unpack : bool, optional
+ If True, the returned array is transposed, so that arguments may be
+ unpacked using ``x, y, z = _read(...)``. When used with a structured
+ data-type, arrays are returned for each field. Default is False.
+ dtype : numpy data type
+ A NumPy dtype instance, can be a structured dtype to map to the
+ columns of the file.
+ encoding : str, optional
+ Encoding used to decode the input file. The special value 'bytes'
+ (the default) enables backwards-compatible behavior for `converters`,
+ ensuring that inputs to the converter functions are encoded
+ bytes objects. The special value 'bytes' has no additional effect if
+ ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
+ default system encoding is used.
+
+ Returns
+ -------
+ ndarray
+ NumPy array.
+
+ Examples
+ --------
+ First we create a file for the example.
+
+ >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
+ >>> with open('example1.csv', 'w') as f:
+ ... f.write(s1)
+ >>> a1 = read_from_filename('example1.csv')
+ >>> a1
+ array([[1., 2., 3.],
+ [4., 5., 6.]])
+
+ The second example has columns with different data types, so a
+ one-dimensional array with a structured data type is returned.
+ The tab character is used as the field delimiter.
+
+ >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
+ >>> with open('example2.tsv', 'w') as f:
+ ... f.write(s2)
+ >>> a2 = read_from_filename('example2.tsv', delimiter='\t')
+ >>> a2
+ array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
+ dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')])
+ """
+ # Handle special 'bytes' keyword for encoding
+ byte_converters = False
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+
+ if dtype is None:
+ raise TypeError("a dtype must be provided.")
+ dtype = np.dtype(dtype)
+
+ read_dtype_via_object_chunks = None
+ if dtype.kind in 'SUM' and (
+ dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
+ # This is a legacy "flexible" dtype. We do not truly support
+ # parametric dtypes currently (no dtype discovery step in the core),
+ # but have to support these for backward compatibility.
+ read_dtype_via_object_chunks = dtype
+ dtype = np.dtype(object)
+
+ if usecols is not None:
+ # Allow usecols to be a single int or a sequence of ints
+ try:
+ usecols_as_list = list(usecols)
+ except TypeError:
+ usecols_as_list = [usecols]
+ for col_idx in usecols_as_list:
+ try:
+ operator.index(col_idx)
+ except TypeError:
+ # Some unit tests for numpy.loadtxt require that the
+ # error message matches this format.
+ raise TypeError(
+ "usecols must be an int or a sequence of ints but "
+ "it contains at least one element of type %s" %
+ type(col_idx),
+ ) from None
+ # Fall back to existing code
+ usecols = np.array([operator.index(i) for i in usecols_as_list],
+ dtype=np.int32)
+
+ _ensure_ndmin_ndarray_check_param(ndmin)
+
+ if not isinstance(comment, str):
+ # assume comments are a sequence of strings
+ comments = tuple(comment)
+ comment = ''
+ # If there is only one comment, and that comment has one character,
+ # the normal parsing can deal with it just fine.
+ if len(comments) == 1:
+ if isinstance(comments[0], str) and len(comments[0]) == 1:
+ comment = comments[0]
+ comments = None
+ elif len(comment) > 1:
+ comments = (comment,)
+ comment = ''
+ else:
+ comments = None
+
+ # comment is now either a 1 or 0 character string or a tuple:
+ if comments is not None:
+ assert comment == ''
+ # Note: An earlier version supported two-character comments (and could
+ # have been extended to multiple characters); we assume this is
+ # rare enough not to optimize for.
+ if quote != "":
+ raise ValueError(
+ "when multiple comments or a multi-character comment is given, "
+ "quotes are not supported. In this case the quote character "
+ "must be set to the empty string: `quote=''`.")
+ else:
+ # No preprocessing necessary
+ assert comments is None
+
+ if len(imaginary_unit) != 1:
+ raise ValueError('len(imaginary_unit) must be 1.')
+
+ _check_nonneg_int(skiprows)
+ if max_rows is not None:
+ _check_nonneg_int(max_rows)
+ else:
+ # Passing -1 to the C code means "read the entire file".
+ max_rows = -1
+
+ fh_closing_ctx = contextlib.nullcontext()
+ filelike = False
+ try:
+ if isinstance(fname, os.PathLike):
+ fname = os.fspath(fname)
+ # TODO: loadtxt actually uses `file + ''` to decide this?!
+ if isinstance(fname, str):
+ fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ if encoding is None:
+ encoding = getattr(fh, 'encoding', 'latin1')
+
+ fh_closing_ctx = contextlib.closing(fh)
+ data = fh
+ filelike = True
+ else:
+ if encoding is None:
+ encoding = getattr(fname, 'encoding', 'latin1')
+ data = iter(fname)
+ except TypeError as e:
+ raise ValueError(
+ f"fname must be a string, filehandle, list of strings,\n"
+ f"or generator. Got {type(fname)} instead.") from e
+
+ with fh_closing_ctx:
+ if comments is not None:
+ if filelike:
+ data = iter(data)
+ filelike = False
+ data = _preprocess_comments(data, comments, encoding)
+
+ if read_dtype_via_object_chunks is None:
+ arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+ usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters)
+
+ else:
+ # This branch reads the file into chunks of object arrays and then
+ # casts them to the desired actual dtype. This ensures correct
+ # string-length and datetime-unit discovery (as for `arr.astype()`).
+ # Due to chunking, certain error reports are less clear, currently.
+ if filelike:
+ data = iter(data) # cannot chunk when reading from file
+
+ c_byte_converters = False
+ if read_dtype_via_object_chunks == "S":
+ c_byte_converters = True # Use latin1 rather than ascii
+
+ chunks = []
+ while max_rows != 0:
+ if max_rows < 0:
+ chunk_size = _loadtxt_chunksize
+ else:
+ chunk_size = min(_loadtxt_chunksize, max_rows)
+
+ next_arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+ usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters,
+ c_byte_converters=c_byte_converters)
+ # Cast here already. We hope that this is better even for
+ # large files because the storage is more compact. It could
+ # be adapted (in principle the concatenate could cast).
+ chunks.append(next_arr.astype(read_dtype_via_object_chunks))
+
+ skiprows = 0 # Only have to skip for first chunk
+ if max_rows >= 0:
+ max_rows -= chunk_size
+ if len(next_arr) < chunk_size:
+ # There was less data than requested, so we are done.
+ break
+
+ # Need at least one chunk, but if empty, the last one may have
+ # the wrong shape.
+ if len(chunks) > 1 and len(chunks[-1]) == 0:
+ del chunks[-1]
+ if len(chunks) == 1:
+ arr = chunks[0]
+ else:
+ arr = np.concatenate(chunks, axis=0)
+
+ arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)
+
+ if unpack:
+ # Handle unpack like np.loadtxt.
+ # XXX Check interaction with ndmin!
+ dt = arr.dtype
+ if dt.names is not None:
+ # For structured arrays, return an array for each field.
+ return [arr[field] for field in dt.names]
+ else:
+ return arr.T
+ else:
+ return arr
+
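(Editor's aside, not part of the patch: the object-chunk branch above
leans on `astype` to discover the string length or datetime unit, which
is why "flexible" dtype requests such as ``dtype="S"`` are first read as
object arrays.)

>>> import numpy as np
>>> chunk = np.array([['a', 'bc'], ['def', 'gh']], dtype=object)
>>> chunk.astype('S')  # itemsize discovered from the data
array([[b'a', b'bc'],
       [b'def', b'gh']], dtype='|S3')
>>> np.array(['2021-10-20'], dtype=object).astype('M8')
array(['2021-10-20'], dtype='datetime64[D]')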
+
@set_array_function_like_doc
@set_module('numpy')
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
@@ -1000,228 +1211,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
max_rows=max_rows, like=like
)
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Nested functions used by loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- def split_line(line: str):
- """Chop off comments, strip, and split at delimiter."""
- for comment in comments: # Much faster than using a single regex.
- line = line.split(comment, 1)[0]
- line = line.strip('\r\n')
- return line.split(delimiter) if line else []
+ if delimiter is None:
+ delimiter = ''
+ elif isinstance(delimiter, bytes):
+ delimiter = delimiter.decode("latin1")
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Main body of loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- _ensure_ndmin_ndarray_check_param(ndmin)
+ if dtype is None:
+ dtype = np.float64
+ comment = comments
# Type conversions for Py3 convenience
- if comments is not None:
- if isinstance(comments, (str, bytes)):
- comments = [comments]
- comments = [_decode_line(x) for x in comments]
- else:
- comments = []
-
- if delimiter is not None:
- delimiter = _decode_line(delimiter)
-
- user_converters = converters
-
- byte_converters = False
- if encoding == 'bytes':
- encoding = None
- byte_converters = True
-
- if usecols is not None:
- # Copy usecols, allowing it to be a single int or a sequence of ints.
- try:
- usecols = list(usecols)
- except TypeError:
- usecols = [usecols]
- for i, col_idx in enumerate(usecols):
- try:
- usecols[i] = opindex(col_idx) # Cast to builtin int now.
- except TypeError as e:
- e.args = (
- "usecols must be an int or a sequence of ints but "
- "it contains at least one element of type %s" %
- type(col_idx),
- )
- raise
- if len(usecols) > 1:
- usecols_getter = itemgetter(*usecols)
- else:
- # Get an iterable back, even if using a single column.
- usecols_getter = lambda obj, c=usecols[0]: [obj[c]]
+ if comment is None:
+ comment = ''
else:
- usecols_getter = None
-
- # Make sure we're dealing with a proper dtype
- dtype = np.dtype(dtype)
- defconv = _getconv(dtype)
-
- dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype)
-
- fh_closing_ctx = contextlib.nullcontext()
- try:
- if isinstance(fname, os_PathLike):
- fname = os_fspath(fname)
- if _is_string_like(fname):
- fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
- fencoding = getattr(fh, 'encoding', 'latin1')
- line_iter = iter(fh)
- fh_closing_ctx = contextlib.closing(fh)
- else:
- line_iter = iter(fname)
- fencoding = getattr(fname, 'encoding', 'latin1')
- try:
- first_line = next(line_iter)
- except StopIteration:
- pass # Nothing matters if line_iter is empty.
- else:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- if isinstance(first_line, bytes):
- # Using latin1 matches _decode_line's behavior.
- decoder = methodcaller(
- "decode",
- encoding if encoding is not None else "latin1")
- line_iter = map(decoder, line_iter)
- except TypeError as e:
- raise ValueError(
- f"fname must be a string, filehandle, list of strings,\n"
- f"or generator. Got {type(fname)} instead."
- ) from e
-
- with fh_closing_ctx:
-
- # input may be a python2 io stream
- if encoding is not None:
- fencoding = encoding
- # we must assume local encoding
- # TODO emit portability warning?
- elif fencoding is None:
- import locale
- fencoding = locale.getpreferredencoding()
-
- # Skip the first `skiprows` lines
- for i in range(skiprows):
- next(line_iter)
-
- # Read until we find a line with some values, and use it to determine
- # the need for decoding and estimate the number of columns.
- for first_line in line_iter:
- ncols = len(usecols or split_line(first_line))
- if ncols:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- break
- else: # End of lines reached
- ncols = len(usecols or [])
- warnings.warn('loadtxt: Empty input file: "%s"' % fname,
- stacklevel=2)
-
- line_iter = itertools.islice(line_iter, max_rows)
- lineno_words_iter = filter(
- itemgetter(1), # item[1] is words; filter skips empty lines.
- enumerate(map(split_line, line_iter), 1 + skiprows))
-
- # Now that we know ncols, create the default converters list, and
- # set packing, if necessary.
- if len(dtype_types) > 1:
- # We're dealing with a structured array, each field of
- # the dtype matches a column
- converters = [_getconv(dt) for dt in dtype_types]
- else:
- # All fields have the same dtype; use specialized packers which are
- # much faster than those using _loadtxt_pack_items.
- converters = [defconv for i in range(ncols)]
- if ncols == 1:
- packer = itemgetter(0)
- else:
- def packer(row): return row
-
- # By preference, use the converters specified by the user
- for i, conv in (user_converters or {}).items():
- if usecols:
- try:
- i = usecols.index(i)
- except ValueError:
- # Unused converter specified
- continue
- if byte_converters:
- # converters may use decode to workaround numpy's old
- # behaviour, so encode the string again (converters are only
- # called with strings) before passing to the user converter.
- def tobytes_first(conv, x):
- return conv(x.encode("latin1"))
- converters[i] = functools.partial(tobytes_first, conv)
- else:
- converters[i] = conv
-
- fencode = methodcaller("encode", fencoding)
- converters = [conv if conv is not bytes else fencode
- for conv in converters]
- if len(set(converters)) == 1:
- # Optimize single-type data. Note that this is only reached if
- # `_getconv` returns equal callables (i.e. not local lambdas) on
- # equal dtypes.
- def convert_row(vals, _conv=converters[0]):
- return [*map(_conv, vals)]
- else:
- def convert_row(vals):
- return [conv(val) for conv, val in zip(converters, vals)]
-
- # read data in chunks and fill it into an array via resize
- # over-allocating and shrinking the array later may be faster but is
- # probably not relevant compared to the cost of actually reading and
- # converting the data
- X = None
- while True:
- chunk = []
- for lineno, words in itertools.islice(
- lineno_words_iter, _loadtxt_chunksize):
- if usecols_getter is not None:
- words = usecols_getter(words)
- elif len(words) != ncols:
- raise ValueError(
- f"Wrong number of columns at line {lineno}")
- # Convert each value according to its column, then pack it
- # according to the dtype's nesting, and store it.
- chunk.append(packer(convert_row(words)))
- if not chunk: # The islice is empty, i.e. we're done.
- break
-
- if X is None:
- X = np.array(chunk, dtype)
- else:
- nshape = list(X.shape)
- pos = nshape[0]
- nshape[0] += len(chunk)
- X.resize(nshape, refcheck=False)
- X[pos:, ...] = chunk
-
- if X is None:
- X = np.array([], dtype)
+ if isinstance(comment, (str, bytes)):
+ comment = [comment]
+ comment = [x.decode('latin1') if isinstance(x, bytes) else x for x in comment]
- # Multicolumn data are returned with shape (1, N, M), i.e.
- # (1, 1, M) for a single row - remove the singleton dimension there
- if X.ndim == 3 and X.shape[:2] == (1, 1):
- X.shape = (1, -1)
+ arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
+ converters=converters, skiprows=skiprows, usecols=usecols,
+ unpack=unpack, ndmin=ndmin, encoding=encoding,
+ max_rows=max_rows, quote='')
- X = _ensure_ndmin_ndarray(X, ndmin=ndmin)
-
- if unpack:
- if len(dtype_types) > 1:
- # For structured arrays, return an array for each field.
- return [X[field] for field in dtype.names]
- else:
- return X.T
- else:
- return X
+ return arr
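(Editor's illustration, not part of the patch: the `unpack` handling now
lives in `_read`; with a structured dtype, one array per field is
returned, so the result can be tuple-unpacked. A sketch reusing the
`example2.tsv` file from the `_read` docstring.)

>>> dt = np.dtype([('x', float), ('n', np.uint8), ('name', 'S5')])
>>> x, n, name = np.loadtxt('example2.tsv', dtype=dt, delimiter='\t',
...                         unpack=True)
>>> x
array([1. , 2.3, 4.5])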
_loadtxt_with_like = array_function_dispatch(