Diffstat (limited to 'numpy/lib/npyio.py')
-rw-r--r--  numpy/lib/npyio.py  634
1 file changed, 323 insertions, 311 deletions
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index a6c2d4c2d..c2472f601 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -5,6 +5,7 @@ import itertools
 import warnings
 import weakref
 import contextlib
+import operator
 from operator import itemgetter, index as opindex, methodcaller
 from collections.abc import Mapping
 
@@ -13,6 +14,7 @@ from . import format
 from ._datasource import DataSource
 from numpy.core import overrides
 from numpy.core.multiarray import packbits, unpackbits
+from numpy.core._multiarray_umath import _load_from_filelike
 from numpy.core.overrides import set_array_function_like_doc, set_module
 from ._iotools import (
     LineSplitter, NameValidator, StringConverter, ConverterError,
@@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None):
     zipf.close()
 
 
-def _floatconv(x):
-    try:
-        return float(x)  # The fastest path.
-    except ValueError:
-        if '0x' in x:  # Don't accidentally convert "a" ("0xa") to 10.
-            try:
-                return float.fromhex(x)
-            except ValueError:
-                pass
-        raise  # Raise the original exception, which makes more sense.
-
-
-_CONVERTERS = [  # These converters only ever get strs (not bytes) as input.
-    (np.bool_, lambda x: bool(int(x))),
-    (np.uint64, np.uint64),
-    (np.int64, np.int64),
-    (np.integer, lambda x: int(float(x))),
-    (np.longdouble, np.longdouble),
-    (np.floating, _floatconv),
-    (complex, lambda x: complex(x.replace('+-', '-'))),
-    (np.bytes_, methodcaller('encode', 'latin-1')),
-    (np.unicode_, str),
-]
-
-
-def _getconv(dtype):
-    """
-    Find the correct dtype converter. Adapted from matplotlib.
-
-    Even when a lambda is returned, it is defined at the toplevel, to allow
-    testing for equality and enabling optimization for single-type data.
-    """
-    for base, conv in _CONVERTERS:
-        if issubclass(dtype.type, base):
-            return conv
-    return str
-
-
-# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers
-# lifted to the toplevel because recursive inner functions cause either
-# GC-dependent reference loops (because they are closures over loadtxt's
-# internal variables) or large overheads if using a manual trampoline to hide
-# the recursive calls.
-
-
-# not to be confused with the flatten_dtype we import...
-def _loadtxt_flatten_dtype_internal(dt):
-    """Unpack a structured data-type, and produce a packer function."""
-    if dt.names is None:
-        # If the dtype is flattened, return.
-        # If the dtype has a shape, the dtype occurs
-        # in the list more than once.
-        shape = dt.shape
-        if len(shape) == 0:
-            return ([dt.base], None)
-        else:
-            packing = [(shape[-1], list)]
-            if len(shape) > 1:
-                for dim in dt.shape[-2::-1]:
-                    packing = [(dim*packing[0][0], packing*dim)]
-            return ([dt.base] * int(np.prod(dt.shape)),
-                    functools.partial(_loadtxt_pack_items, packing))
-    else:
-        types = []
-        packing = []
-        for field in dt.names:
-            tp, bytes = dt.fields[field]
-            flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
-            types.extend(flat_dt)
-            flat_packing = flat_packer.args[0] if flat_packer else None
-            # Avoid extra nesting for subarrays
-            if tp.ndim > 0:
-                packing.extend(flat_packing)
-            else:
-                packing.append((len(flat_dt), flat_packing))
-        return (types, functools.partial(_loadtxt_pack_items, packing))
-
-
-def _loadtxt_pack_items(packing, items):
-    """Pack items into nested lists based on re-packing info."""
-    if packing is None:
-        return items[0]
-    elif packing is tuple:
-        return tuple(items)
-    elif packing is list:
-        return list(items)
-    else:
-        start = 0
-        ret = []
-        for length, subpacking in packing:
-            ret.append(
-                _loadtxt_pack_items(subpacking, items[start:start+length]))
-            start += length
-        return tuple(ret)
-
 def _ensure_ndmin_ndarray_check_param(ndmin):
     """Just checks if the param ndmin is supported on
         _ensure_ndmin_ndarray. Is intended to be used as
@@ -859,6 +766,310 @@ def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
     return (like,)
 
 
+def _check_nonneg_int(value, name="argument"):
+    try:
+        operator.index(value)
+    except TypeError:
+        raise TypeError(f"{name} must be an integer") from None
+    if value < 0:
+        raise ValueError(f"{name} must be nonnegative")
+
+
+def _preprocess_comments(iterable, comments, encoding):
+    """
+    Generator that consumes an iterable of lines and strips out the
+    multiple (or multi-character) comments from those lines.
+    This is a pre-processing step to achieve feature parity with loadtxt
+    (we assume that this is a niche feature).
+    """
+    for line in iterable:
+        if isinstance(line, bytes):
+            # Need to handle conversion here, or the splitting would fail
+            line = line.decode(encoding)
+
+        for c in comments:
+            line = line.split(c, 1)[0]
+
+        yield line
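For illustration, this is how the generator above behaves on a small input. The call uses the private helper exactly as defined in this hunk; the sample lines are invented:

```python
lines = ["1,2 # trailing comment", "// a full-line comment", "3,4"]
# Each comment prefix is stripped in turn; lines that become empty are
# dropped later by the tokenizer, not here.
stripped = list(_preprocess_comments(lines, comments=("#", "//"),
                                     encoding="utf-8"))
print(stripped)  # ['1,2 ', '', '3,4']
```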
+
+
+# The number of rows we read in one go if confronted with a parametric dtype
+_loadtxt_chunksize = 50000
+
+
+def _read(fname, *, delimiter=',', comment='#', quote='"',
+          imaginary_unit='j', usecols=None, skiprows=0,
+          max_rows=None, converters=None, ndmin=None, unpack=False,
+          dtype=np.float64, encoding="bytes"):
+    r"""
+    Read a NumPy array from a text file.
+
+    Parameters
+    ----------
+    fname : str or file object
+        The filename or the file to be read.
+    delimiter : str, optional
+        Field delimiter separating the fields in a line of the file.
+        Default is a comma, ','.
+    comment : str or sequence of str, optional
+        Character that begins a comment. All text from the comment
+        character to the end of the line is ignored.
+        Multiple comments or multiple-character comment strings are
+        supported, but may be slower, and `quote` must be empty if used.
+    quote : str, optional
+        Character that is used to quote string fields. Default is '"'
+        (a double quote).
+    imaginary_unit : str, optional
+        Character that represents the imaginary unit `sqrt(-1)`.
+        Default is 'j'.
+    usecols : array_like, optional
+        A one-dimensional array of integer column numbers. These are the
+        columns from the file to be included in the array. If this value
+        is not given, all the columns are used.
+    skiprows : int, optional
+        Number of lines to skip before interpreting the data in the file.
+    max_rows : int, optional
+        Maximum number of rows of data to read. Default is to read the
+        entire file.
+    converters : dict, optional
+        A dictionary mapping column number to a function that will parse
+        the column string into the desired value. E.g., if column 0 is a
+        date string: ``converters = {0: datestr2num}``. Converters can
+        also be used to provide a default value for missing data, e.g.
+        ``converters = {3: lambda s: float(s.strip() or 0)}``.
+        Default: None
+    ndmin : int, optional
+        Minimum dimension of the array returned.
+        Allowed values are 0, 1 or 2. Default is 0.
+    unpack : bool, optional
+        If True, the returned array is transposed, so that arguments may be
+        unpacked using ``x, y, z = read(...)``. When used with a structured
+        data-type, arrays are returned for each field. Default is False.
+    dtype : numpy data type
+        A NumPy dtype instance, can be a structured dtype to map to the
+        columns of the file.
+    encoding : str, optional
+        Encoding used to decode the input file. The special value 'bytes'
+        (the default) enables backward-compatible behavior for `converters`,
+        ensuring that inputs to the converter functions are encoded
+        bytes objects. The special value 'bytes' has no additional effect if
+        ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
+        default system encoding is used.
+
+    Returns
+    -------
+    ndarray
+        NumPy array.
+
+    Examples
+    --------
+    First we create a file for the example.
+
+    >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
+    >>> with open('example1.csv', 'w') as f:
+    ...     f.write(s1)
+    >>> a1 = _read('example1.csv')
+    >>> a1
+    array([[1., 2., 3.],
+           [4., 5., 6.]])
+
+    The second example has columns with different data types, so a
+    one-dimensional array with a structured data type is returned.
+    The tab character is used as the field delimiter.
+
+    >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
+    >>> with open('example2.tsv', 'w') as f:
+    ...     f.write(s2)
+    >>> dt = np.dtype([('f0', float), ('f1', np.uint8), ('f2', 'S5')])
+    >>> a2 = _read('example2.tsv', delimiter='\t', dtype=dt)
+    >>> a2
+    array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
+          dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')])
+    """
+    # Handle special 'bytes' keyword for encoding
+    byte_converters = False
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+
+    if dtype is None:
+        raise TypeError("a dtype must be provided.")
+    dtype = np.dtype(dtype)
+
+    read_dtype_via_object_chunks = None
+    if dtype.kind in 'SUM' and (
+            dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
+        # This is a legacy "flexible" dtype.  We do not truly support
+        # parametric dtypes currently (no dtype discovery step in the core),
+        # but have to support these for backward compatibility.
+        read_dtype_via_object_chunks = dtype
+        dtype = np.dtype(object)
+
+    if usecols is not None:
+        # Allow usecols to be a single int or a sequence of ints
+        try:
+            usecols_as_list = list(usecols)
+        except TypeError:
+            usecols_as_list = [usecols]
+        for col_idx in usecols_as_list:
+            try:
+                operator.index(col_idx)
+            except TypeError:
+                # Some unit tests for numpy.loadtxt require that the
+                # error message matches this format.
+                raise TypeError(
+                    "usecols must be an int or a sequence of ints but "
+                    "it contains at least one element of type %s" %
+                    type(col_idx),
+                ) from None
+        # Fall back to existing code
+        usecols = np.array([operator.index(i) for i in usecols_as_list],
+                           dtype=np.int32)
+
+    _ensure_ndmin_ndarray_check_param(ndmin)
+
+    if not isinstance(comment, str):
+        # assume comments are a sequence of strings
+        comments = tuple(comment)
+        comment = ''
+        # If there is only one comment, and that comment has one character,
+        # the normal parsing can deal with it just fine.
+        if len(comments) == 1:
+            if isinstance(comments[0], str) and len(comments[0]) == 1:
+                comment = comments[0]
+                comments = None
+    elif len(comment) > 1:
+        comments = (comment,)
+        comment = ''
+    else:
+        comments = None
+
+    # comment is now either a 1 or 0 character string or a tuple:
+    if comments is not None:
+        assert comment == ''
+        # Note: An earlier version supported two-character comments (and
+        # could have been extended to multiple characters); we assume this
+        # is rare enough not to optimize for.
+        if quote != "":
+            raise ValueError(
+                "when multiple comments or a multi-character comment is "
+                "given, quotes are not supported.  In this case the quote "
+                "character must be set to the empty string: `quote=''`.")
+    else:
+        # No preprocessing necessary
+        assert comments is None
+
+    if len(imaginary_unit) != 1:
+        raise ValueError('len(imaginary_unit) must be 1.')
+
+    _check_nonneg_int(skiprows)
+    if max_rows is not None:
+        _check_nonneg_int(max_rows)
+    else:
+        # Passing -1 to the C code means "read the entire file".
+        max_rows = -1
+
+    fh_closing_ctx = contextlib.nullcontext()
+    filelike = False
+    try:
+        if isinstance(fname, os.PathLike):
+            fname = os.fspath(fname)
+        # TODO: loadtxt actually uses `file + ''` to decide this?!
+        if isinstance(fname, str):
+            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
+            if encoding is None:
+                encoding = getattr(fh, 'encoding', 'latin1')
+
+            fh_closing_ctx = contextlib.closing(fh)
+            data = fh
+            filelike = True
+        else:
+            if encoding is None:
+                encoding = getattr(fname, 'encoding', 'latin1')
+            data = iter(fname)
+    except TypeError as e:
+        raise ValueError(
+            f"fname must be a string, filehandle, list of strings,\n"
+            f"or generator. Got {type(fname)} instead.") from e
+
+    with fh_closing_ctx:
+        if comments is not None:
+            if filelike:
+                data = iter(data)
+                filelike = False
+            data = _preprocess_comments(data, comments, encoding)
+
+        if read_dtype_via_object_chunks is None:
+            arr = _load_from_filelike(
+                data, delimiter=delimiter, comment=comment, quote=quote,
+                imaginary_unit=imaginary_unit,
+                usecols=usecols, skiprows=skiprows, max_rows=max_rows,
+                converters=converters, dtype=dtype,
+                encoding=encoding, filelike=filelike,
+                byte_converters=byte_converters)
+
+        else:
+            # This branch reads the file into chunks of object arrays and
+            # then casts them to the desired actual dtype.  This ensures
+            # correct string-length and datetime-unit discovery (as for
+            # `arr.astype()`).  Due to chunking, certain error reports are
+            # less clear, currently.
+            if filelike:
+                data = iter(data)  # cannot chunk when reading from file
+
+            c_byte_converters = False
+            if read_dtype_via_object_chunks == "S":
+                c_byte_converters = True  # Use latin1 rather than ascii
+
+            chunks = []
+            while max_rows != 0:
+                if max_rows < 0:
+                    chunk_size = _loadtxt_chunksize
+                else:
+                    chunk_size = min(_loadtxt_chunksize, max_rows)
+
+                next_arr = _load_from_filelike(
+                    data, delimiter=delimiter, comment=comment, quote=quote,
+                    imaginary_unit=imaginary_unit,
+                    usecols=usecols, skiprows=skiprows, max_rows=chunk_size,
+                    converters=converters, dtype=dtype,
+                    encoding=encoding, filelike=filelike,
+                    byte_converters=byte_converters,
+                    c_byte_converters=c_byte_converters)
+                # Cast here already.  We hope that this is better even for
+                # large files because the storage is more compact.  It could
+                # be adapted (in principle the concatenate could cast).
+                chunks.append(next_arr.astype(read_dtype_via_object_chunks))
+
+                skiprows = 0  # Only have to skip for first chunk
+                if max_rows >= 0:
+                    max_rows -= chunk_size
+                if len(next_arr) < chunk_size:
+                    # There was less data than requested, so we are done.
+                    break
+
+            # Need at least one chunk, but if empty, the last one may have
+            # the wrong shape.
+            if len(chunks) > 1 and len(chunks[-1]) == 0:
+                del chunks[-1]
+            if len(chunks) == 1:
+                arr = chunks[0]
+            else:
+                arr = np.concatenate(chunks, axis=0)
+
+    arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin)
+
+    if unpack:
+        # Handle unpack like np.loadtxt.
+        # XXX Check interaction with ndmin!
+        dt = arr.dtype
+        if dt.names is not None:
+            # For structured arrays, return an array for each field.
+            return [arr[field] for field in dt.names]
+        else:
+            return arr.T
+    else:
+        return arr
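The object-chunk branch above delegates all parametric-dtype discovery to `astype`. A minimal sketch of that idea outside NumPy's internals (the sample values are invented, and `_load_from_filelike` itself is not called here):

```python
import numpy as np

# The C parser hands back object arrays of strings when _read takes this
# branch; a single cast then discovers the parametric details, here the
# string itemsize (and, given an explicit unit, parses the dates).
chunk = np.array([["2020-01-01", "alpha"],
                  ["2020-02-03", "beta"]], dtype=object)
dates = chunk[:, 0].astype("M8[D]")  # ISO date strings parsed by the cast
names = chunk[:, 1].astype("U")      # itemsize discovered by the cast
print(dates.dtype, names.dtype)      # datetime64[D] <U5
```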
+
+
 @set_array_function_like_doc
 @set_module('numpy')
 def loadtxt(fname, dtype=float, comments='#', delimiter=None,
@@ -1000,228 +1211,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
             max_rows=max_rows, like=like
         )
 
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    # Nested functions used by loadtxt.
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-    def split_line(line: str):
-        """Chop off comments, strip, and split at delimiter."""
-        for comment in comments:  # Much faster than using a single regex.
-            line = line.split(comment, 1)[0]
-        line = line.strip('\r\n')
-        return line.split(delimiter) if line else []
+    if delimiter is None:
+        delimiter = ''
+    elif isinstance(delimiter, bytes):
+        delimiter = delimiter.decode("latin1")
 
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    # Main body of loadtxt.
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-    _ensure_ndmin_ndarray_check_param(ndmin)
+    if dtype is None:
+        dtype = np.float64
 
+    comment = comments
     # Type conversions for Py3 convenience
-    if comments is not None:
-        if isinstance(comments, (str, bytes)):
-            comments = [comments]
-        comments = [_decode_line(x) for x in comments]
-    else:
-        comments = []
-
-    if delimiter is not None:
-        delimiter = _decode_line(delimiter)
-
-    user_converters = converters
-
-    byte_converters = False
-    if encoding == 'bytes':
-        encoding = None
-        byte_converters = True
-
-    if usecols is not None:
-        # Copy usecols, allowing it to be a single int or a sequence of ints.
-        try:
-            usecols = list(usecols)
-        except TypeError:
-            usecols = [usecols]
-        for i, col_idx in enumerate(usecols):
-            try:
-                usecols[i] = opindex(col_idx)  # Cast to builtin int now.
-            except TypeError as e:
-                e.args = (
-                    "usecols must be an int or a sequence of ints but "
-                    "it contains at least one element of type %s" %
-                    type(col_idx),
-                    )
-                raise
-        if len(usecols) > 1:
-            usecols_getter = itemgetter(*usecols)
-        else:
-            # Get an iterable back, even if using a single column.
-            usecols_getter = lambda obj, c=usecols[0]: [obj[c]]
-    else:
-        usecols_getter = None
-
-    # Make sure we're dealing with a proper dtype
-    dtype = np.dtype(dtype)
-    defconv = _getconv(dtype)
-
-    dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype)
-
-    fh_closing_ctx = contextlib.nullcontext()
-    try:
-        if isinstance(fname, os_PathLike):
-            fname = os_fspath(fname)
-        if _is_string_like(fname):
-            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
-            fencoding = getattr(fh, 'encoding', 'latin1')
-            line_iter = iter(fh)
-            fh_closing_ctx = contextlib.closing(fh)
-        else:
-            line_iter = iter(fname)
-            fencoding = getattr(fname, 'encoding', 'latin1')
-        try:
-            first_line = next(line_iter)
-        except StopIteration:
-            pass  # Nothing matters if line_iter is empty.
-        else:
-            # Put first_line back.
-            line_iter = itertools.chain([first_line], line_iter)
-            if isinstance(first_line, bytes):
-                # Using latin1 matches _decode_line's behavior.
-                decoder = methodcaller(
-                    "decode",
-                    encoding if encoding is not None else "latin1")
-                line_iter = map(decoder, line_iter)
-    except TypeError as e:
-        raise ValueError(
-            f"fname must be a string, filehandle, list of strings,\n"
-            f"or generator. Got {type(fname)} instead."
-        ) from e
-
-    with fh_closing_ctx:
-
-        # input may be a python2 io stream
-        if encoding is not None:
-            fencoding = encoding
-        # we must assume local encoding
-        # TODO emit portability warning?
-        elif fencoding is None:
-            import locale
-            fencoding = locale.getpreferredencoding()
-
-        # Skip the first `skiprows` lines
-        for i in range(skiprows):
-            next(line_iter)
-
-        # Read until we find a line with some values, and use it to
-        # determine the need for decoding and estimate the number of
-        # columns.
-        for first_line in line_iter:
-            ncols = len(usecols or split_line(first_line))
-            if ncols:
-                # Put first_line back.
-                line_iter = itertools.chain([first_line], line_iter)
-                break
-        else:  # End of lines reached
-            ncols = len(usecols or [])
-            warnings.warn('loadtxt: Empty input file: "%s"' % fname,
-                          stacklevel=2)
-
-        line_iter = itertools.islice(line_iter, max_rows)
-        lineno_words_iter = filter(
-            itemgetter(1),  # item[1] is words; filter skips empty lines.
-            enumerate(map(split_line, line_iter), 1 + skiprows))
-
-        # Now that we know ncols, create the default converters list, and
-        # set packing, if necessary.
-        if len(dtype_types) > 1:
-            # We're dealing with a structured array, each field of
-            # the dtype matches a column
-            converters = [_getconv(dt) for dt in dtype_types]
-        else:
-            # All fields have the same dtype; use specialized packers which
-            # are much faster than those using _loadtxt_pack_items.
-            converters = [defconv for i in range(ncols)]
-            if ncols == 1:
-                packer = itemgetter(0)
-            else:
-                def packer(row): return row
-
-        # By preference, use the converters specified by the user
-        for i, conv in (user_converters or {}).items():
-            if usecols:
-                try:
-                    i = usecols.index(i)
-                except ValueError:
-                    # Unused converter specified
-                    continue
-            if byte_converters:
-                # converters may use decode to workaround numpy's old
-                # behaviour, so encode the string again (converters are only
-                # called with strings) before passing to the user converter.
-                def tobytes_first(conv, x):
-                    return conv(x.encode("latin1"))
-                converters[i] = functools.partial(tobytes_first, conv)
-            else:
-                converters[i] = conv
-
-        fencode = methodcaller("encode", fencoding)
-        converters = [conv if conv is not bytes else fencode
-                      for conv in converters]
-        if len(set(converters)) == 1:
-            # Optimize single-type data. Note that this is only reached if
-            # `_getconv` returns equal callables (i.e. not local lambdas) on
-            # equal dtypes.
-            def convert_row(vals, _conv=converters[0]):
-                return [*map(_conv, vals)]
-        else:
-            def convert_row(vals):
-                return [conv(val) for conv, val in zip(converters, vals)]
-
-        # read data in chunks and fill it into an array via resize
-        # over-allocating and shrinking the array later may be faster but is
-        # probably not relevant compared to the cost of actually reading and
-        # converting the data
-        X = None
-        while True:
-            chunk = []
-            for lineno, words in itertools.islice(
-                    lineno_words_iter, _loadtxt_chunksize):
-                if usecols_getter is not None:
-                    words = usecols_getter(words)
-                elif len(words) != ncols:
-                    raise ValueError(
-                        f"Wrong number of columns at line {lineno}")
-                # Convert each value according to its column, then pack it
-                # according to the dtype's nesting, and store it.
-                chunk.append(packer(convert_row(words)))
-            if not chunk:  # The islice is empty, i.e. we're done.
-                break
-
-            if X is None:
-                X = np.array(chunk, dtype)
-            else:
-                nshape = list(X.shape)
-                pos = nshape[0]
-                nshape[0] += len(chunk)
-                X.resize(nshape, refcheck=False)
-                X[pos:, ...] = chunk
-
-        if X is None:
-            X = np.array([], dtype)
+    if comment is None:
+        comment = ''
+    else:
-        # Multicolumn data are returned with shape (1, N, M), i.e.
-        # (1, 1, M) for a single row - remove the singleton dimension there
-        if X.ndim == 3 and X.shape[:2] == (1, 1):
-            X.shape = (1, -1)
+        if isinstance(comment, (str, bytes)):
+            comment = [comment]
+        comment = [x.decode('latin1') if isinstance(x, bytes) else x
+                   for x in comment]
 
-    X = _ensure_ndmin_ndarray(X, ndmin=ndmin)
+    arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter,
+                converters=converters, skiprows=skiprows, usecols=usecols,
+                unpack=unpack, ndmin=ndmin, encoding=encoding,
+                max_rows=max_rows, quote='')
 
-    if unpack:
-        if len(dtype_types) > 1:
-            # For structured arrays, return an array for each field.
-            return [X[field] for field in dtype.names]
-        else:
-            return X.T
-    else:
-        return X
+    return arr
 
 
 _loadtxt_with_like = array_function_dispatch(
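With the wrapper reduced to argument normalization plus a single `_read` call, the comment handling can be checked end to end through the public API. A small usage sketch, assuming a NumPy build that contains this patch:

```python
import numpy as np
from io import StringIO

# Multiple comment prefixes take the _preprocess_comments path; a single
# one-character prefix is handled directly by the C tokenizer.  loadtxt
# always passes quote='', so quoting stays disabled for this API.
text = StringIO("# header\n1 2\n// note\n3 4\n")
print(np.loadtxt(text, comments=["#", "//"]))
# [[1. 2.]
#  [3. 4.]]
```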