Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--  numpy/lib/npyio.py  634
1 file changed, 323 insertions(+), 311 deletions(-)
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index a6c2d4c2d..c2472f601 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -5,6 +5,7 @@ import itertools
import warnings
import weakref
import contextlib
+import operator
from operator import itemgetter, index as opindex, methodcaller
from collections.abc import Mapping
@@ -13,6 +14,7 @@ from . import format
from ._datasource import DataSource
from numpy.core import overrides
from numpy.core.multiarray import packbits, unpackbits
+from numpy.core._multiarray_umath import _load_from_filelike
from numpy.core.overrides import set_array_function_like_doc, set_module
from ._iotools import (
LineSplitter, NameValidator, StringConverter, ConverterError,
@@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
zipf.close()
-def _floatconv(x):
- try:
- return float(x) # The fastest path.
- except ValueError:
- if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10.
- try:
- return float.fromhex(x)
- except ValueError:
- pass
- raise # Raise the original exception, which makes more sense.
-
-
-_CONVERTERS = [ # These converters only ever get strs (not bytes) as input.
- (np.bool_, lambda x: bool(int(x))),
- (np.uint64, np.uint64),
- (np.int64, np.int64),
- (np.integer, lambda x: int(float(x))),
- (np.longdouble, np.longdouble),
- (np.floating, _floatconv),
- (complex, lambda x: complex(x.replace('+-', '-'))),
- (np.bytes_, methodcaller('encode', 'latin-1')),
- (np.unicode_, str),
-]
-
-
-def _getconv(dtype):
- """
- Find the correct dtype converter. Adapted from matplotlib.
-
- Even when a lambda is returned, it is defined at the toplevel, to allow
- testing for equality and enabling optimization for single-type data.
- """
- for base, conv in _CONVERTERS:
- if issubclass(dtype.type, base):
- return conv
- return str
-
-
-# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers
-# lifted to the toplevel because recursive inner functions cause either
-# GC-dependent reference loops (because they are closures over loadtxt's
-# internal variables) or large overheads if using a manual trampoline to hide
-# the recursive calls.
-
-
-# not to be confused with the flatten_dtype we import...
-def _loadtxt_flatten_dtype_internal(dt):
- """Unpack a structured data-type, and produce a packer function."""
- if dt.names is None:
- # If the dtype is flattened, return.
- # If the dtype has a shape, the dtype occurs
- # in the list more than once.
- shape = dt.shape
- if len(shape) == 0:
- return ([dt.base], None)
- else:
- packing = [(shape[-1], list)]
- if len(shape) > 1:
- for dim in dt.shape[-2::-1]:
- packing = [(dim*packing[0][0], packing*dim)]
- return ([dt.base] * int(np.prod(dt.shape)),
- functools.partial(_loadtxt_pack_items, packing))
- else:
- types = []
- packing = []
- for field in dt.names:
- tp, bytes = dt.fields[field]
- flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
- types.extend(flat_dt)
- flat_packing = flat_packer.args[0] if flat_packer else None
- # Avoid extra nesting for subarrays
- if tp.ndim > 0:
- packing.extend(flat_packing)
- else:
- packing.append((len(flat_dt), flat_packing))
- return (types, functools.partial(_loadtxt_pack_items, packing))
-
-
-def _loadtxt_pack_items(packing, items):
- """Pack items into nested lists based on re-packing info."""
- if packing is None:
- return items[0]
- elif packing is tuple:
- return tuple(items)
- elif packing is list:
- return list(items)
- else:
- start = 0
- ret = []
- for length, subpacking in packing:
- ret.append(
- _loadtxt_pack_items(subpacking, items[start:start+length]))
- start += length
- return tuple(ret)
-
def _ensure_ndmin_ndarray_check_param(ndmin):
"""Just checks if the param ndmin is supported on
_ensure_ndmin_ndarray. Is intended to be used as
@@ -859,6 +766,310 @@ def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
return (like,)
+def _check_nonneg_int(value, name="argument"):
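+    # For example: 0, 5, or np.int64(5) pass silently; -1 raises
+    # ValueError, and 1.5 or "5" raise TypeError (via operator.index).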
+ try:
+ operator.index(value)
+ except TypeError:
+ raise TypeError(f"{name} must be an integer") from None
+ if value < 0:
+ raise ValueError(f"{name} must be nonnegative")
+
+
+def _preprocess_comments(iterable, comments, encoding):
+ """
+    Generator that consumes a line-by-line iterable and strips
+    multiple (or multi-character) comments from each line.
+    This is a pre-processing step to achieve feature parity with loadtxt
+    (we assume this is a niche feature).
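+
+    For example (illustrative)::
+
+        >>> lines = ["1 2 // comment", "3 % comment"]
+        >>> list(_preprocess_comments(lines, ["//", "%"], None))
+        ['1 2 ', '3 ']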
+ """
+ for line in iterable:
+ if isinstance(line, bytes):
+ # Need to handle conversion here, or the splitting would fail
+ line = line.decode(encoding)
+
+ for c in comments:
+ line = line.split(c, 1)[0]
+
+ yield line
+
+
+# The number of rows we read in one go if confronted with a parametric dtype
+_loadtxt_chunksize = 50000
+
+
+def _read(fname, *, delimiter=',', comment='#', quote='"',
+ imaginary_unit='j', usecols=None, skiprows=0,
+ max_rows=None, converters=None, ndmin=None, unpack=False,
+ dtype=np.float64, encoding="bytes"):
+ r"""
+ Read a NumPy array from a text file.
+
+ Parameters
+ ----------
+ fname : str or file object
+ The filename or the file to be read.
+ delimiter : str, optional
+        Field delimiter separating the fields in a line of the file.
+ Default is a comma, ','.
+ comment : str or sequence of str, optional
+ Character that begins a comment. All text from the comment
+ character to the end of the line is ignored.
+        Multiple comments or multi-character comment strings are supported,
+        but may be slower; if used, `quote` must be the empty string.
+ quote : str, optional
+ Character that is used to quote string fields. Default is '"'
+ (a double quote).
+ imaginary_unit : str, optional
+        Character that represents the imaginary unit `sqrt(-1)`.
+ Default is 'j'.
+ usecols : array_like, optional
+ A one-dimensional array of integer column numbers. These are the
+ columns from the file to be included in the array. If this value
+ is not given, all the columns are used.
+ skiprows : int, optional
+ Number of lines to skip before interpreting the data in the file.
+ max_rows : int, optional
+ Maximum number of rows of data to read. Default is to read the
+ entire file.
+ converters : dict, optional
+ A dictionary mapping column number to a function that will parse the
+ column string into the desired value. E.g. if column 0 is a date
+ string: ``converters = {0: datestr2num}``. Converters can also be used
+ to provide a default value for missing data, e.g.
+ ``converters = {3: lambda s: float(s.strip() or 0)}``.
+ Default: None
+ ndmin : int, optional
+ Minimum dimension of the array returned.
+ Allowed values are 0, 1 or 2. Default is 0.
+ unpack : bool, optional
+ If True, the returned array is transposed, so that arguments may be
+ unpacked using ``x, y, z = read(...)``. When used with a structured
+ data-type, arrays are returned for each field. Default is False.
+ dtype : numpy data type
+ A NumPy dtype instance, can be a structured dtype to map to the
+ columns of the file.
+ encoding : str, optional
+        Encoding used to decode the input file. The special value 'bytes'
+ (the default) enables backwards-compatible behavior for `converters`,
+ ensuring that inputs to the converter functions are encoded
+ bytes objects. The special value 'bytes' has no additional effect if
+ ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
+ default system encoding is used.
+
+ Returns
+ -------
+ ndarray
+ NumPy array.
+
+ Examples
+ --------
+ First we create a file for the example.
+
+ >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
+ >>> with open('example1.csv', 'w') as f:
+ ... f.write(s1)
+    >>> a1 = _read('example1.csv')
+ >>> a1
+ array([[1., 2., 3.],
+ [4., 5., 6.]])
+
+    The second example has columns with different data types, so a
+    structured data type must be specified and a one-dimensional array
+    with that data type is returned. The tab character is used as the
+    field delimiter.
+
+ >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
+ >>> with open('example2.tsv', 'w') as f:
+ ... f.write(s2)
+    >>> a2 = _read('example2.tsv', delimiter='\t', dtype="f8,u1,S5")
+ >>> a2
+ array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
+ dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')])
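+
+    As an illustration of ``unpack`` (reusing ``example1.csv`` from above),
+    the transposed result lets columns be unpacked directly:
+
+    >>> x, y, z = _read('example1.csv', unpack=True)
+    >>> x
+    array([1., 4.])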
+ """
+ # Handle special 'bytes' keyword for encoding
+ byte_converters = False
+ if encoding == 'bytes':
+ encoding = None
+ byte_converters = True
+
+ if dtype is None:
+ raise TypeError("a dtype must be provided.")
+ dtype = np.dtype(dtype)
+
+ read_dtype_via_object_chunks = None
+ if dtype.kind in 'SUM' and (
+ dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
+ # This is a legacy "flexible" dtype. We do not truly support
+ # parametric dtypes currently (no dtype discovery step in the core),
+ # but have to support these for backward compatibility.
+ read_dtype_via_object_chunks = dtype
+ dtype = np.dtype(object)
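+        # For example, np.dtype("S") (bytes of undiscovered length) or
+        # np.dtype("M8") (datetime64 without a unit) take this path; the
+        # actual itemsize/unit is found below via per-chunk `.astype()`.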
+
+ if usecols is not None:
+ # Allow usecols to be a single int or a sequence of ints
+ try:
+ usecols_as_list = list(usecols)
+ except TypeError:
+ usecols_as_list = [usecols]
+ for col_idx in usecols_as_list:
+ try:
+ operator.index(col_idx)
+ except TypeError:
+ # Some unit tests for numpy.loadtxt require that the
+ # error message matches this format.
+ raise TypeError(
+ "usecols must be an int or a sequence of ints but "
+ "it contains at least one element of type %s" %
+ type(col_idx),
+ ) from None
+        # Convert usecols to an int32 array for the parsing code below.
+ usecols = np.array([operator.index(i) for i in usecols_as_list],
+ dtype=np.int32)
+
+ _ensure_ndmin_ndarray_check_param(ndmin)
+
+ if not isinstance(comment, str):
+ # assume comments are a sequence of strings
+ comments = tuple(comment)
+ comment = ''
+ # If there is only one comment, and that comment has one character,
+ # the normal parsing can deal with it just fine.
+ if len(comments) == 1:
+ if isinstance(comments[0], str) and len(comments[0]) == 1:
+ comment = comments[0]
+ comments = None
+ elif len(comment) > 1:
+ comments = (comment,)
+ comment = ''
+ else:
+ comments = None
+
+    # comment is now a 0- or 1-character string; anything longer (or
+    # multiple comments) lives in the `comments` tuple:
+ if comments is not None:
+ assert comment == ''
+        # Note: An earlier version supported two-character comments (and could
+        # have been extended to multiple characters); we assume this is rare
+        # enough not to optimize for.
+ if quote != "":
+ raise ValueError(
+ "when multiple comments or a multi-character comment is given, "
+ "quotes are not supported. In this case the quote character "
+ "must be set to the empty string: `quote=''`.")
+ else:
+ # No preprocessing necessary
+ assert comments is None
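+    # For example: comment='#' keeps the fast path (comment='#',
+    # comments=None); comment='//' or comment=('#', '%') are routed
+    # through _preprocess_comments (comment='', comments a tuple).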
+
+ if len(imaginary_unit) != 1:
+ raise ValueError('len(imaginary_unit) must be 1.')
+
+ _check_nonneg_int(skiprows)
+ if max_rows is not None:
+ _check_nonneg_int(max_rows)
+ else:
+ # Passing -1 to the C code means "read the entire file".
+ max_rows = -1
+
+ fh_closing_ctx = contextlib.nullcontext()
+ filelike = False
+ try:
+ if isinstance(fname, os.PathLike):
+ fname = os.fspath(fname)
+ # TODO: loadtxt actually uses `file + ''` to decide this?!
+ if isinstance(fname, str):
+ fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+ if encoding is None:
+ encoding = getattr(fh, 'encoding', 'latin1')
+
+ fh_closing_ctx = contextlib.closing(fh)
+ data = fh
+ filelike = True
+ else:
+ if encoding is None:
+ encoding = getattr(fname, 'encoding', 'latin1')
+ data = iter(fname)
+ except TypeError as e:
+ raise ValueError(
+ f"fname must be a string, filehandle, list of strings,\n"
+ f"or generator. Got {type(fname)} instead.") from e
+
+ with fh_closing_ctx:
+ if comments is not None:
+ if filelike:
+ data = iter(data)
+ filelike = False
+ data = _preprocess_comments(data, comments, encoding)
+
+ if read_dtype_via_object_chunks is None:
+ arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+ usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters)
+
+ else:
+ # This branch reads the file into chunks of object arrays and then
+ # casts them to the desired actual dtype. This ensures correct
+ # string-length and datetime-unit discovery (as for `arr.astype()`).
+ # Due to chunking, certain error reports are less clear, currently.
+ if filelike:
+ data = iter(data) # cannot chunk when reading from file
+
+ c_byte_converters = False
+ if read_dtype_via_object_chunks == "S":
+ c_byte_converters = True # Use latin1 rather than ascii
+
+ chunks = []
+ while max_rows != 0:
+ if max_rows < 0:
+ chunk_size = _loadtxt_chunksize
+ else:
+ chunk_size = min(_loadtxt_chunksize, max_rows)
+
+ next_arr = _load_from_filelike(
+ data, delimiter=delimiter, comment=comment, quote=quote,
+ imaginary_unit=imaginary_unit,
+ usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+ converters=converters, dtype=dtype,
+ encoding=encoding, filelike=filelike,
+ byte_converters=byte_converters,
+ c_byte_converters=c_byte_converters)
+                # Cast here already: we hope this is better even for large
+                # files because the storage is more compact. In principle
+                # the concatenate below could do the cast instead.
+ chunks.append(next_arr.astype(read_dtype_via_object_chunks))
+
+ skiprows = 0 # Only have to skip for first chunk
+ if max_rows >= 0:
+ max_rows -= chunk_size
+ if len(next_arr) < chunk_size:
+ # There was less data than requested, so we are done.
+ break
+
+ # Need at least one chunk, but if empty, the last one may have
+ # the wrong shape.
+ if len(chunks) > 1 and len(chunks[-1]) == 0:
+ del chunks[-1]
+ if len(chunks) == 1:
+ arr = chunks[0]
+ else:
+ arr = np.concatenate(chunks, axis=0)
+
+ arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)
+
+ if unpack:
+ # Handle unpack like np.loadtxt.
+ # XXX Check interaction with ndmin!
+ dt = arr.dtype
+ if dt.names is not None:
+ # For structured arrays, return an array for each field.
+ return [arr[field] for field in dt.names]
+ else:
+ return arr.T
+ else:
+ return arr
+
+
@set_array_function_like_doc
@set_module('numpy')
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
@@ -1000,228 +1211,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
max_rows=max_rows, like=like
)
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Nested functions used by loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- def split_line(line: str):
- """Chop off comments, strip, and split at delimiter."""
- for comment in comments: # Much faster than using a single regex.
- line = line.split(comment, 1)[0]
- line = line.strip('\r\n')
- return line.split(delimiter) if line else []
+ if delimiter is None:
+ delimiter = ''
+ elif isinstance(delimiter, bytes):
+        delimiter = delimiter.decode("latin1")
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- # Main body of loadtxt.
- # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
- _ensure_ndmin_ndarray_check_param(ndmin)
+ if dtype is None:
+ dtype = np.float64
+ comment = comments
# Type conversions for Py3 convenience
- if comments is not None:
- if isinstance(comments, (str, bytes)):
- comments = [comments]
- comments = [_decode_line(x) for x in comments]
- else:
- comments = []
-
- if delimiter is not None:
- delimiter = _decode_line(delimiter)
-
- user_converters = converters
-
- byte_converters = False
- if encoding == 'bytes':
- encoding = None
- byte_converters = True
-
- if usecols is not None:
- # Copy usecols, allowing it to be a single int or a sequence of ints.
- try:
- usecols = list(usecols)
- except TypeError:
- usecols = [usecols]
- for i, col_idx in enumerate(usecols):
- try:
- usecols[i] = opindex(col_idx) # Cast to builtin int now.
- except TypeError as e:
- e.args = (
- "usecols must be an int or a sequence of ints but "
- "it contains at least one element of type %s" %
- type(col_idx),
- )
- raise
- if len(usecols) > 1:
- usecols_getter = itemgetter(*usecols)
- else:
- # Get an iterable back, even if using a single column.
- usecols_getter = lambda obj, c=usecols[0]: [obj[c]]
+ if comment is None:
+ comment = ''
else:
- usecols_getter = None
-
- # Make sure we're dealing with a proper dtype
- dtype = np.dtype(dtype)
- defconv = _getconv(dtype)
-
- dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype)
-
- fh_closing_ctx = contextlib.nullcontext()
- try:
- if isinstance(fname, os_PathLike):
- fname = os_fspath(fname)
- if _is_string_like(fname):
- fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
- fencoding = getattr(fh, 'encoding', 'latin1')
- line_iter = iter(fh)
- fh_closing_ctx = contextlib.closing(fh)
- else:
- line_iter = iter(fname)
- fencoding = getattr(fname, 'encoding', 'latin1')
- try:
- first_line = next(line_iter)
- except StopIteration:
- pass # Nothing matters if line_iter is empty.
- else:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- if isinstance(first_line, bytes):
- # Using latin1 matches _decode_line's behavior.
- decoder = methodcaller(
- "decode",
- encoding if encoding is not None else "latin1")
- line_iter = map(decoder, line_iter)
- except TypeError as e:
- raise ValueError(
- f"fname must be a string, filehandle, list of strings,\n"
- f"or generator. Got {type(fname)} instead."
- ) from e
-
- with fh_closing_ctx:
-
- # input may be a python2 io stream
- if encoding is not None:
- fencoding = encoding
- # we must assume local encoding
- # TODO emit portability warning?
- elif fencoding is None:
- import locale
- fencoding = locale.getpreferredencoding()
-
- # Skip the first `skiprows` lines
- for i in range(skiprows):
- next(line_iter)
-
- # Read until we find a line with some values, and use it to determine
- # the need for decoding and estimate the number of columns.
- for first_line in line_iter:
- ncols = len(usecols or split_line(first_line))
- if ncols:
- # Put first_line back.
- line_iter = itertools.chain([first_line], line_iter)
- break
- else: # End of lines reached
- ncols = len(usecols or [])
- warnings.warn('loadtxt: Empty input file: "%s"' % fname,
- stacklevel=2)
-
- line_iter = itertools.islice(line_iter, max_rows)
- lineno_words_iter = filter(
- itemgetter(1), # item[1] is words; filter skips empty lines.
- enumerate(map(split_line, line_iter), 1 + skiprows))
-
- # Now that we know ncols, create the default converters list, and
- # set packing, if necessary.
- if len(dtype_types) > 1:
- # We're dealing with a structured array, each field of
- # the dtype matches a column
- converters = [_getconv(dt) for dt in dtype_types]
- else:
- # All fields have the same dtype; use specialized packers which are
- # much faster than those using _loadtxt_pack_items.
- converters = [defconv for i in range(ncols)]
- if ncols == 1:
- packer = itemgetter(0)
- else:
- def packer(row): return row
-
- # By preference, use the converters specified by the user
- for i, conv in (user_converters or {}).items():
- if usecols:
- try:
- i = usecols.index(i)
- except ValueError:
- # Unused converter specified
- continue
- if byte_converters:
- # converters may use decode to workaround numpy's old
- # behaviour, so encode the string again (converters are only
- # called with strings) before passing to the user converter.
- def tobytes_first(conv, x):
- return conv(x.encode("latin1"))
- converters[i] = functools.partial(tobytes_first, conv)
- else:
- converters[i] = conv
-
- fencode = methodcaller("encode", fencoding)
- converters = [conv if conv is not bytes else fencode
- for conv in converters]
- if len(set(converters)) == 1:
- # Optimize single-type data. Note that this is only reached if
- # `_getconv` returns equal callables (i.e. not local lambdas) on
- # equal dtypes.
- def convert_row(vals, _conv=converters[0]):
- return [*map(_conv, vals)]
- else:
- def convert_row(vals):
- return [conv(val) for conv, val in zip(converters, vals)]
-
- # read data in chunks and fill it into an array via resize
- # over-allocating and shrinking the array later may be faster but is
- # probably not relevant compared to the cost of actually reading and
- # converting the data
- X = None
- while True:
- chunk = []
- for lineno, words in itertools.islice(
- lineno_words_iter, _loadtxt_chunksize):
- if usecols_getter is not None:
- words = usecols_getter(words)
- elif len(words) != ncols:
- raise ValueError(
- f"Wrong number of columns at line {lineno}")
- # Convert each value according to its column, then pack it
- # according to the dtype's nesting, and store it.
- chunk.append(packer(convert_row(words)))
- if not chunk: # The islice is empty, i.e. we're done.
- break
-
- if X is None:
- X = np.array(chunk, dtype)
- else:
- nshape = list(X.shape)
- pos = nshape[0]
- nshape[0] += len(chunk)
- X.resize(nshape, refcheck=False)
- X[pos:, ...] = chunk
-
- if X is None:
- X = np.array([], dtype)
+ if isinstance(comment, (str, bytes)):
+ comment = [comment]
+        comment = [x.decode('latin1') if isinstance(x, bytes) else x
+                   for x in comment]
- # Multicolumn data are returned with shape (1, N, M), i.e.
- # (1, 1, M) for a single row - remove the singleton dimension there
- if X.ndim == 3 and X.shape[:2] == (1, 1):
- X.shape = (1, -1)
+ arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
+ converters=converters, skiprows=skiprows, usecols=usecols,
+ unpack=unpack, ndmin=ndmin, encoding=encoding,
+ max_rows=max_rows, quote='')
- X = _ensure_ndmin_ndarray(X, ndmin=ndmin)
-
- if unpack:
- if len(dtype_types) > 1:
- # For structured arrays, return an array for each field.
- return [X[field] for field in dtype.names]
- else:
- return X.T
- else:
- return X
+ return arr
_loadtxt_with_like = array_function_dispatch(