diff options
Diffstat (limited to 'numpy/lib')
-rw-r--r-- | numpy/lib/__init__.pyi | 2 | ||||
-rw-r--r-- | numpy/lib/_datasource.py | 5 | ||||
-rw-r--r-- | numpy/lib/arraysetops.py | 2 | ||||
-rw-r--r-- | numpy/lib/function_base.py | 43 | ||||
-rw-r--r-- | numpy/lib/npyio.py | 752 | ||||
-rw-r--r-- | numpy/lib/shape_base.py | 8 | ||||
-rw-r--r-- | numpy/lib/tests/test_function_base.py | 13 | ||||
-rw-r--r-- | numpy/lib/tests/test_io.py | 77 | ||||
-rw-r--r-- | numpy/lib/tests/test_loadtxt.py | 1006 | ||||
-rw-r--r-- | numpy/lib/utils.py | 3 |
10 files changed, 1545 insertions, 366 deletions
diff --git a/numpy/lib/__init__.pyi b/numpy/lib/__init__.pyi index 7338fc6d6..0e3da5b41 100644 --- a/numpy/lib/__init__.pyi +++ b/numpy/lib/__init__.pyi @@ -14,7 +14,7 @@ from numpy.lib import ( format as format, mixins as mixins, scimath as scimath, - stride_tricks as stride_stricks, + stride_tricks as stride_tricks, ) from numpy.lib._version import ( diff --git a/numpy/lib/_datasource.py b/numpy/lib/_datasource.py index 8201d3772..b7778234e 100644 --- a/numpy/lib/_datasource.py +++ b/numpy/lib/_datasource.py @@ -280,8 +280,9 @@ class DataSource: def _splitzipext(self, filename): """Split zip extension from filename and return filename. - *Returns*: - base, zip_ext : {tuple} + Returns + ------- + base, zip_ext : {tuple} """ diff --git a/numpy/lib/arraysetops.py b/numpy/lib/arraysetops.py index bd56b6975..d44e1a983 100644 --- a/numpy/lib/arraysetops.py +++ b/numpy/lib/arraysetops.py @@ -640,7 +640,7 @@ def _isin_dispatcher(element, test_elements, assume_unique=None, invert=None): @array_function_dispatch(_isin_dispatcher) def isin(element, test_elements, assume_unique=False, invert=False): """ - Calculates `element in test_elements`, broadcasting over `element` only. + Calculates ``element in test_elements``, broadcasting over `element` only. Returns a boolean array of the same shape as `element` that is True where an element of `element` is in `test_elements` and False otherwise. diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index d5b130b72..ff56196c3 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4190,7 +4190,7 @@ def quantile(a, 8. 'median_unbiased' 9. 'normal_unbiased' - The first three methods are discontiuous. NumPy further defines the + The first three methods are discontinuous. NumPy further defines the following discontinuous variations of the default 'linear' (7.) option: * 'lower' @@ -4241,10 +4241,10 @@ def quantile(a, same as the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the same as the maximum if ``q=1.0``. - This optional `method` parameter specifies the method to use when the + The optional `method` parameter specifies the method to use when the desired quantile lies between two data points ``i < j``. - If ``g`` is the fractional part of the index surrounded by ``i`` and - alpha and beta are correction constants modifying i and j. + If ``g`` is the fractional part of the index surrounded by ``i`` and ``j``, + and alpha and beta are correction constants modifying i and j: .. math:: i + g = (q - alpha) / ( n - alpha - beta + 1 ) @@ -4259,38 +4259,38 @@ def quantile(a, averaged_inverted_cdf: method 2 of H&F [1]_. - This method give discontinuous results: + This method gives discontinuous results: * if g > 0 ; then take j * if g = 0 ; then average between bounds closest_observation: method 3 of H&F [1]_. - This method give discontinuous results: + This method gives discontinuous results: * if g > 0 ; then take j * if g = 0 and index is odd ; then take j * if g = 0 and index is even ; then take i interpolated_inverted_cdf: method 4 of H&F [1]_. - This method give continuous results using: + This method gives continuous results using: * alpha = 0 * beta = 1 hazen: method 5 of H&F [1]_. - This method give continuous results using: + This method gives continuous results using: * alpha = 1/2 * beta = 1/2 weibull: method 6 of H&F [1]_. - This method give continuous results using: + This method gives continuous results using: * alpha = 0 * beta = 0 linear: method 7 of H&F [1]_. - This method give continuous results using: + This method gives continuous results using: * alpha = 1 * beta = 1 @@ -4298,7 +4298,7 @@ def quantile(a, method 8 of H&F [1]_. This method is probably the best method if the sample distribution function is unknown (see reference). - This method give continuous results using: + This method gives continuous results using: * alpha = 1/3 * beta = 1/3 @@ -4306,7 +4306,7 @@ def quantile(a, method 9 of H&F [1]_. This method is probably the best method if the sample distribution function is known to be normal. - This method give continuous results using: + This method gives continuous results using: * alpha = 3/8 * beta = 3/8 @@ -4411,7 +4411,7 @@ def _check_interpolation_as_method(method, interpolation, fname): f"the `interpolation=` argument to {fname} was renamed to " "`method=`, which has additional options.\n" "Users of the modes 'nearest', 'lower', 'higher', or " - "'midpoint' are encouraged to review the method they. " + "'midpoint' are encouraged to review the method they used. " "(Deprecated NumPy 1.22)", DeprecationWarning, stacklevel=4) if method != "linear": @@ -5094,6 +5094,18 @@ def delete(arr, obj, axis=None): return new if isinstance(obj, (int, integer)) and not isinstance(obj, bool): + single_value = True + else: + single_value = False + _obj = obj + obj = np.asarray(obj) + if obj.size == 0 and not isinstance(_obj, np.ndarray): + obj = obj.astype(intp) + elif obj.size == 1 and not isinstance(_obj, bool): + obj = obj.astype(intp).reshape(()) + single_value = True + + if single_value: # optimization for a single value if (obj < -N or obj >= N): raise IndexError( @@ -5110,11 +5122,6 @@ def delete(arr, obj, axis=None): slobj2[axis] = slice(obj+1, None) new[tuple(slobj)] = arr[tuple(slobj2)] else: - _obj = obj - obj = np.asarray(obj) - if obj.size == 0 and not isinstance(_obj, np.ndarray): - obj = obj.astype(intp) - if obj.dtype == bool: if obj.shape != (N,): raise ValueError('boolean array argument obj to delete ' diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 6818ef81d..90424aab4 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -5,6 +5,7 @@ import itertools import warnings import weakref import contextlib +import operator from operator import itemgetter, index as opindex, methodcaller from collections.abc import Mapping @@ -13,6 +14,7 @@ from . import format from ._datasource import DataSource from numpy.core import overrides from numpy.core.multiarray import packbits, unpackbits +from numpy.core._multiarray_umath import _load_from_filelike from numpy.core.overrides import set_array_function_like_doc, set_module from ._iotools import ( LineSplitter, NameValidator, StringConverter, ConverterError, @@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): zipf.close() -def _floatconv(x): - try: - return float(x) # The fastest path. - except ValueError: - if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10. - try: - return float.fromhex(x) - except ValueError: - pass - raise # Raise the original exception, which makes more sense. - - -_CONVERTERS = [ # These converters only ever get strs (not bytes) as input. - (np.bool_, lambda x: bool(int(x))), - (np.uint64, np.uint64), - (np.int64, np.int64), - (np.integer, lambda x: int(float(x))), - (np.longdouble, np.longdouble), - (np.floating, _floatconv), - (complex, lambda x: complex(x.replace('+-', '-'))), - (np.bytes_, methodcaller('encode', 'latin-1')), - (np.unicode_, str), -] - - -def _getconv(dtype): - """ - Find the correct dtype converter. Adapted from matplotlib. - - Even when a lambda is returned, it is defined at the toplevel, to allow - testing for equality and enabling optimization for single-type data. - """ - for base, conv in _CONVERTERS: - if issubclass(dtype.type, base): - return conv - return str - - -# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers -# lifted to the toplevel because recursive inner functions cause either -# GC-dependent reference loops (because they are closures over loadtxt's -# internal variables) or large overheads if using a manual trampoline to hide -# the recursive calls. - - -# not to be confused with the flatten_dtype we import... -def _loadtxt_flatten_dtype_internal(dt): - """Unpack a structured data-type, and produce a packer function.""" - if dt.names is None: - # If the dtype is flattened, return. - # If the dtype has a shape, the dtype occurs - # in the list more than once. - shape = dt.shape - if len(shape) == 0: - return ([dt.base], None) - else: - packing = [(shape[-1], list)] - if len(shape) > 1: - for dim in dt.shape[-2::-1]: - packing = [(dim*packing[0][0], packing*dim)] - return ([dt.base] * int(np.prod(dt.shape)), - functools.partial(_loadtxt_pack_items, packing)) - else: - types = [] - packing = [] - for field in dt.names: - tp, bytes = dt.fields[field] - flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp) - types.extend(flat_dt) - flat_packing = flat_packer.args[0] if flat_packer else None - # Avoid extra nesting for subarrays - if tp.ndim > 0: - packing.extend(flat_packing) - else: - packing.append((len(flat_dt), flat_packing)) - return (types, functools.partial(_loadtxt_pack_items, packing)) - - -def _loadtxt_pack_items(packing, items): - """Pack items into nested lists based on re-packing info.""" - if packing is None: - return items[0] - elif packing is tuple: - return tuple(items) - elif packing is list: - return list(items) - else: - start = 0 - ret = [] - for length, subpacking in packing: - ret.append( - _loadtxt_pack_items(subpacking, items[start:start+length])) - start += length - return tuple(ret) - def _ensure_ndmin_ndarray_check_param(ndmin): """Just checks if the param ndmin is supported on _ensure_ndmin_ndarray. It is intended to be used as @@ -853,17 +760,330 @@ def _ensure_ndmin_ndarray(a, *, ndmin: int): _loadtxt_chunksize = 50000 -def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None, - converters=None, skiprows=None, usecols=None, unpack=None, - ndmin=None, encoding=None, max_rows=None, *, like=None): +def _loadtxt_dispatcher( + fname, dtype=None, comments=None, delimiter=None, + converters=None, skiprows=None, usecols=None, unpack=None, + ndmin=None, encoding=None, max_rows=None, *, like=None): return (like,) +def _check_nonneg_int(value, name="argument"): + try: + operator.index(value) + except TypeError: + raise TypeError(f"{name} must be an integer") from None + if value < 0: + raise ValueError(f"{name} must be nonnegative") + + +def _preprocess_comments(iterable, comments, encoding): + """ + Generator that consumes a line iterated iterable and strips out the + multiple (or multi-character) comments from lines. + This is a pre-processing step to achieve feature parity with loadtxt + (we assume that this feature is a nieche feature). + """ + for line in iterable: + if isinstance(line, bytes): + # Need to handle conversion here, or the splitting would fail + line = line.decode(encoding) + + for c in comments: + line = line.split(c, 1)[0] + + yield line + + +# The number of rows we read in one go if confronted with a parametric dtype +_loadtxt_chunksize = 50000 + + +def _read(fname, *, delimiter=',', comment='#', quote='"', + imaginary_unit='j', usecols=None, skiplines=0, + max_rows=None, converters=None, ndmin=None, unpack=False, + dtype=np.float64, encoding="bytes"): + r""" + Read a NumPy array from a text file. + + Parameters + ---------- + fname : str or file object + The filename or the file to be read. + delimiter : str, optional + Field delimiter of the fields in line of the file. + Default is a comma, ','. If None any sequence of whitespace is + considered a delimiter. + comment : str or sequence of str or None, optional + Character that begins a comment. All text from the comment + character to the end of the line is ignored. + Multiple comments or multiple-character comment strings are supported, + but may be slower and `quote` must be empty if used. + Use None to disable all use of comments. + quote : str or None, optional + Character that is used to quote string fields. Default is '"' + (a double quote). Use None to disable quote support. + imaginary_unit : str, optional + Character that represent the imaginay unit `sqrt(-1)`. + Default is 'j'. + usecols : array_like, optional + A one-dimensional array of integer column numbers. These are the + columns from the file to be included in the array. If this value + is not given, all the columns are used. + skiplines : int, optional + Number of lines to skip before interpreting the data in the file. + max_rows : int, optional + Maximum number of rows of data to read. Default is to read the + entire file. + converters : dict or callable, optional + A function to parse all columns strings into the desired value, or + a dictionary mapping column number to a parser function. + E.g. if column 0 is a date string: ``converters = {0: datestr2num}``. + Converters can also be used to provide a default value for missing + data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will + convert empty fields to 0. + Default: None + ndmin : int, optional + Minimum dimension of the array returned. + Allowed values are 0, 1 or 2. Default is 0. + unpack : bool, optional + If True, the returned array is transposed, so that arguments may be + unpacked using ``x, y, z = read(...)``. When used with a structured + data-type, arrays are returned for each field. Default is False. + dtype : numpy data type + A NumPy dtype instance, can be a structured dtype to map to the + columns of the file. + encoding : str, optional + Encoding used to decode the inputfile. The special value 'bytes' + (the default) enables backwards-compatible behavior for `converters`, + ensuring that inputs to the converter functions are encoded + bytes objects. The special value 'bytes' has no additional effect if + ``converters=None``. If encoding is ``'bytes'`` or ``None``, the + default system encoding is used. + + Returns + ------- + ndarray + NumPy array. + + Examples + -------- + First we create a file for the example. + + >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n' + >>> with open('example1.csv', 'w') as f: + ... f.write(s1) + >>> a1 = read_from_filename('example1.csv') + >>> a1 + array([[1., 2., 3.], + [4., 5., 6.]]) + + The second example has columns with different data types, so a + one-dimensional array with a structured data type is returned. + The tab character is used as the field delimiter. + + >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n' + >>> with open('example2.tsv', 'w') as f: + ... f.write(s2) + >>> a2 = read_from_filename('example2.tsv', delimiter='\t') + >>> a2 + array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')], + dtype=[('f0', '<f8'), ('f1', 'u1'), ('f2', 'S5')]) + """ + # Handle special 'bytes' keyword for encoding + byte_converters = False + if encoding == 'bytes': + encoding = None + byte_converters = True + + if dtype is None: + raise TypeError("a dtype must be provided.") + dtype = np.dtype(dtype) + + read_dtype_via_object_chunks = None + if dtype.kind in 'SUM' and ( + dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'): + # This is a legacy "flexible" dtype. We do not truly support + # parametric dtypes currently (no dtype discovery step in the core), + # but have to support these for backward compatibility. + read_dtype_via_object_chunks = dtype + dtype = np.dtype(object) + + if usecols is not None: + # Allow usecols to be a single int or a sequence of ints, the C-code + # handles the rest + try: + usecols = list(usecols) + except TypeError: + usecols = [usecols] + + _ensure_ndmin_ndarray_check_param(ndmin) + + if comment is None: + comments = None + else: + # assume comments are a sequence of strings + if "" in comment: + raise ValueError( + "comments cannot be an empty string. Use comments=None to " + "disable comments." + ) + comments = tuple(comment) + comment = None + if len(comments) == 0: + comments = None # No comments at all + elif len(comments) == 1: + # If there is only one comment, and that comment has one character, + # the normal parsing can deal with it just fine. + if isinstance(comments[0], str) and len(comments[0]) == 1: + comment = comments[0] + comments = None + else: + # Input validation if there are multiple comment characters + if delimiter in comments: + raise TypeError( + f"Comment characters '{comments}' cannot include the " + f"delimiter '{delimiter}'" + ) + + # comment is now either a 1 or 0 character string or a tuple: + if comments is not None: + # Note: An earlier version support two character comments (and could + # have been extended to multiple characters, we assume this is + # rare enough to not optimize for. + if quote is not None: + raise ValueError( + "when multiple comments or a multi-character comment is " + "given, quotes are not supported. In this case quotechar " + "must be set to None.") + + if len(imaginary_unit) != 1: + raise ValueError('len(imaginary_unit) must be 1.') + + _check_nonneg_int(skiplines) + if max_rows is not None: + _check_nonneg_int(max_rows) + else: + # Passing -1 to the C code means "read the entire file". + max_rows = -1 + + fh_closing_ctx = contextlib.nullcontext() + filelike = False + try: + if isinstance(fname, os.PathLike): + fname = os.fspath(fname) + if isinstance(fname, str): + fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) + if encoding is None: + encoding = getattr(fh, 'encoding', 'latin1') + + fh_closing_ctx = contextlib.closing(fh) + data = fh + filelike = True + else: + if encoding is None: + encoding = getattr(fname, 'encoding', 'latin1') + data = iter(fname) + except TypeError as e: + raise ValueError( + f"fname must be a string, filehandle, list of strings,\n" + f"or generator. Got {type(fname)} instead.") from e + + with fh_closing_ctx: + if comments is not None: + if filelike: + data = iter(data) + filelike = False + data = _preprocess_comments(data, comments, encoding) + + if read_dtype_via_object_chunks is None: + arr = _load_from_filelike( + data, delimiter=delimiter, comment=comment, quote=quote, + imaginary_unit=imaginary_unit, + usecols=usecols, skiplines=skiplines, max_rows=max_rows, + converters=converters, dtype=dtype, + encoding=encoding, filelike=filelike, + byte_converters=byte_converters) + + else: + # This branch reads the file into chunks of object arrays and then + # casts them to the desired actual dtype. This ensures correct + # string-length and datetime-unit discovery (like `arr.astype()`). + # Due to chunking, certain error reports are less clear, currently. + if filelike: + data = iter(data) # cannot chunk when reading from file + + c_byte_converters = False + if read_dtype_via_object_chunks == "S": + c_byte_converters = True # Use latin1 rather than ascii + + chunks = [] + while max_rows != 0: + if max_rows < 0: + chunk_size = _loadtxt_chunksize + else: + chunk_size = min(_loadtxt_chunksize, max_rows) + + next_arr = _load_from_filelike( + data, delimiter=delimiter, comment=comment, quote=quote, + imaginary_unit=imaginary_unit, + usecols=usecols, skiplines=skiplines, max_rows=max_rows, + converters=converters, dtype=dtype, + encoding=encoding, filelike=filelike, + byte_converters=byte_converters, + c_byte_converters=c_byte_converters) + # Cast here already. We hope that this is better even for + # large files because the storage is more compact. It could + # be adapted (in principle the concatenate could cast). + chunks.append(next_arr.astype(read_dtype_via_object_chunks)) + + skiprows = 0 # Only have to skip for first chunk + if max_rows >= 0: + max_rows -= chunk_size + if len(next_arr) < chunk_size: + # There was less data than requested, so we are done. + break + + # Need at least one chunk, but if empty, the last one may have + # the wrong shape. + if len(chunks) > 1 and len(chunks[-1]) == 0: + del chunks[-1] + if len(chunks) == 1: + arr = chunks[0] + else: + arr = np.concatenate(chunks, axis=0) + + # NOTE: ndmin works as advertised for structured dtypes, but normally + # these would return a 1D result plus the structured dimension, + # so ndmin=2 adds a third dimension even when no squeezing occurs. + # A `squeeze=False` could be a better solution (pandas uses squeeze). + arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin) + + if arr.shape: + if arr.shape[0] == 0: + warnings.warn( + f'loadtxt: input contained no data: "{fname}"', + category=UserWarning, + stacklevel=3 + ) + + if unpack: + # Unpack structured dtypes if requested: + dt = arr.dtype + if dt.names is not None: + # For structured arrays, return an array for each field. + return [arr[field] for field in dt.names] + else: + return arr.T + else: + return arr + + @set_array_function_like_doc @set_module('numpy') def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, - ndmin=0, encoding='bytes', max_rows=None, *, like=None): + ndmin=0, encoding='bytes', max_rows=None, *, quotechar=None, + like=None): r""" Load data from a text file. @@ -882,19 +1102,20 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, each row will be interpreted as an element of the array. In this case, the number of columns used must match the number of fields in the data-type. - comments : str or sequence of str, optional + comments : str or sequence of str or None, optional The characters or list of characters used to indicate the start of a comment. None implies no comments. For backwards compatibility, byte strings will be decoded as 'latin1'. The default is '#'. delimiter : str, optional The string used to separate values. For backwards compatibility, byte strings will be decoded as 'latin1'. The default is whitespace. - converters : dict, optional - A dictionary mapping column number to a function that will parse the - column string into the desired value. E.g., if column 0 is a date - string: ``converters = {0: datestr2num}``. Converters can also be - used to provide a default value for missing data (but see also - `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``. + converters : dict or callable, optional + A function to parse all columns strings into the desired value, or + a dictionary mapping column number to a parser function. + E.g. if column 0 is a date string: ``converters = {0: datestr2num}``. + Converters can also be used to provide a default value for missing + data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will + convert empty fields to 0. Default: None. skiprows : int, optional Skip the first `skiprows` lines, including comments; default: 0. @@ -932,6 +1153,16 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, is to read all the lines. .. versionadded:: 1.16.0 + quotechar : unicode character or None, optional + The character used to denote the start and end of a quoted item. + Occurrences of the delimiter or comment characters are ignored within + a quoted item. The default value is ``quotechar=None``, which means + quoting support is disabled. + + If two consecutive instances of `quotechar` are found within a quoted + field, the first is treated as an escape character. See examples. + + .. versionadded:: 1.23.0 ${ARRAY_FUNCTION_LIKE} .. versionadded:: 1.20.0 @@ -979,6 +1210,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, >>> y array([2., 4.]) + The `converters` argument is used to specify functions to preprocess the + text prior to parsing. `converters` can be a dictionary that maps + preprocessing functions to each column: + + >>> s = StringIO("1.618, 2.296\n3.141, 4.669\n") + >>> conv = { + ... 0: lambda x: np.floor(float(x)), # conversion fn for column 0 + ... 1: lambda x: np.ceil(float(x)), # conversion fn for column 1 + ... } + >>> np.loadtxt(s, delimiter=",", converters=conv) + array([[1., 3.], + [3., 5.]]) + + `converters` can be a callable instead of a dictionary, in which case it + is applied to all columns: + + >>> s = StringIO("0xDE 0xAD\n0xC0 0xDE") + >>> import functools + >>> conv = functools.partial(int, base=16) + >>> np.loadtxt(s, converters=conv) + array([[222., 173.], + [192., 222.]]) + This example shows how `converters` can be used to convert a field with a trailing minus sign into a negative number. @@ -986,242 +1240,90 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, >>> def conv(fld): ... return -float(fld[:-1]) if fld.endswith(b'-') else float(fld) ... - >>> np.loadtxt(s, converters={0: conv, 1: conv}) + >>> np.loadtxt(s, converters=conv) array([[ 10.01, -31.25], [ 19.22, 64.31], [-17.57, 63.94]]) - """ - - if like is not None: - return _loadtxt_with_like( - fname, dtype=dtype, comments=comments, delimiter=delimiter, - converters=converters, skiprows=skiprows, usecols=usecols, - unpack=unpack, ndmin=ndmin, encoding=encoding, - max_rows=max_rows, like=like - ) - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Nested functions used by loadtxt. - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Using a callable as the converter can be particularly useful for handling + values with different formatting, e.g. floats with underscores: - def split_line(line: str): - """Chop off comments, strip, and split at delimiter.""" - for comment in comments: # Much faster than using a single regex. - line = line.split(comment, 1)[0] - line = line.strip('\r\n') - return line.split(delimiter) if line else [] + >>> s = StringIO("1 2.7 100_000") + >>> np.loadtxt(s, converters=float) + array([1.e+00, 2.7e+00, 1.e+05]) - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Main body of loadtxt. - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - _ensure_ndmin_ndarray_check_param(ndmin) - - # Type conversions for Py3 convenience - if comments is not None: - if isinstance(comments, (str, bytes)): - comments = [comments] - comments = [_decode_line(x) for x in comments] - else: - comments = [] - - if delimiter is not None: - delimiter = _decode_line(delimiter) - - user_converters = converters - - byte_converters = False - if encoding == 'bytes': - encoding = None - byte_converters = True - - if usecols is not None: - # Copy usecols, allowing it to be a single int or a sequence of ints. - try: - usecols = list(usecols) - except TypeError: - usecols = [usecols] - for i, col_idx in enumerate(usecols): - try: - usecols[i] = opindex(col_idx) # Cast to builtin int now. - except TypeError as e: - e.args = ( - "usecols must be an int or a sequence of ints but " - "it contains at least one element of type %s" % - type(col_idx), - ) - raise - if len(usecols) > 1: - usecols_getter = itemgetter(*usecols) - else: - # Get an iterable back, even if using a single column. - usecols_getter = lambda obj, c=usecols[0]: [obj[c]] - else: - usecols_getter = None + This idea can be extended to automatically handle values specified in + many different formats: - # Make sure we're dealing with a proper dtype - dtype = np.dtype(dtype) - defconv = _getconv(dtype) + >>> def conv(val): + ... try: + ... return float(val) + ... except ValueError: + ... return float.fromhex(val) + >>> s = StringIO("1, 2.5, 3_000, 0b4, 0x1.4000000000000p+2") + >>> np.loadtxt(s, delimiter=",", converters=conv, encoding=None) + array([1.0e+00, 2.5e+00, 3.0e+03, 1.8e+02, 5.0e+00]) - dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype) + Note that with the default ``encoding="bytes"``, the inputs to the + converter function are latin-1 encoded byte strings. To deactivate the + implicit encoding prior to conversion, use ``encoding=None`` - fh_closing_ctx = contextlib.nullcontext() - try: - if isinstance(fname, os_PathLike): - fname = os_fspath(fname) - if _is_string_like(fname): - fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) - fencoding = getattr(fh, 'encoding', 'latin1') - line_iter = iter(fh) - fh_closing_ctx = contextlib.closing(fh) - else: - line_iter = iter(fname) - fencoding = getattr(fname, 'encoding', 'latin1') - try: - first_line = next(line_iter) - except StopIteration: - pass # Nothing matters if line_iter is empty. - else: - # Put first_line back. - line_iter = itertools.chain([first_line], line_iter) - if isinstance(first_line, bytes): - # Using latin1 matches _decode_line's behavior. - decoder = methodcaller( - "decode", - encoding if encoding is not None else "latin1") - line_iter = map(decoder, line_iter) - except TypeError as e: - raise ValueError( - f"fname must be a string, filehandle, list of strings,\n" - f"or generator. Got {type(fname)} instead." - ) from e + >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94') + >>> conv = lambda x: -float(x[:-1]) if x.endswith('-') else float(x) + >>> np.loadtxt(s, converters=conv, encoding=None) + array([[ 10.01, -31.25], + [ 19.22, 64.31], + [-17.57, 63.94]]) - with fh_closing_ctx: + Support for quoted fields is enabled with the `quotechar` parameter. + Comment and delimiter characters are ignored when they appear within a + quoted item delineated by `quotechar`: - # input may be a python2 io stream - if encoding is not None: - fencoding = encoding - # we must assume local encoding - # TODO emit portability warning? - elif fencoding is None: - import locale - fencoding = locale.getpreferredencoding() - - # Skip the first `skiprows` lines - for i in range(skiprows): - next(line_iter) - - # Read until we find a line with some values, and use it to determine - # the need for decoding and estimate the number of columns. - for first_line in line_iter: - ncols = len(usecols or split_line(first_line)) - if ncols: - # Put first_line back. - line_iter = itertools.chain([first_line], line_iter) - break - else: # End of lines reached - ncols = len(usecols or []) - warnings.warn('loadtxt: Empty input file: "%s"' % fname, - stacklevel=2) - - line_iter = itertools.islice(line_iter, max_rows) - lineno_words_iter = filter( - itemgetter(1), # item[1] is words; filter skips empty lines. - enumerate(map(split_line, line_iter), 1 + skiprows)) - - # Now that we know ncols, create the default converters list, and - # set packing, if necessary. - if len(dtype_types) > 1: - # We're dealing with a structured array, each field of - # the dtype matches a column - converters = [_getconv(dt) for dt in dtype_types] - else: - # All fields have the same dtype; use specialized packers which are - # much faster than those using _loadtxt_pack_items. - converters = [defconv for i in range(ncols)] - if ncols == 1: - packer = itemgetter(0) - else: - def packer(row): return row + >>> s = StringIO('"alpha, #42", 10.0\n"beta, #64", 2.0\n') + >>> dtype = np.dtype([("label", "U12"), ("value", float)]) + >>> np.loadtxt(s, dtype=dtype, delimiter=",", quotechar='"') + array([('alpha, #42', 10.), ('beta, #64', 2.)], + dtype=[('label', '<U12'), ('value', '<f8')]) - # By preference, use the converters specified by the user - for i, conv in (user_converters or {}).items(): - if usecols: - try: - i = usecols.index(i) - except ValueError: - # Unused converter specified - continue - if byte_converters: - # converters may use decode to workaround numpy's old - # behaviour, so encode the string again (converters are only - # called with strings) before passing to the user converter. - def tobytes_first(conv, x): - return conv(x.encode("latin1")) - converters[i] = functools.partial(tobytes_first, conv) - else: - converters[i] = conv - - fencode = methodcaller("encode", fencoding) - converters = [conv if conv is not bytes else fencode - for conv in converters] - if len(set(converters)) == 1: - # Optimize single-type data. Note that this is only reached if - # `_getconv` returns equal callables (i.e. not local lambdas) on - # equal dtypes. - def convert_row(vals, _conv=converters[0]): - return [*map(_conv, vals)] - else: - def convert_row(vals): - return [conv(val) for conv, val in zip(converters, vals)] - - # read data in chunks and fill it into an array via resize - # over-allocating and shrinking the array later may be faster but is - # probably not relevant compared to the cost of actually reading and - # converting the data - X = None - while True: - chunk = [] - for lineno, words in itertools.islice( - lineno_words_iter, _loadtxt_chunksize): - if usecols_getter is not None: - words = usecols_getter(words) - elif len(words) != ncols: - raise ValueError( - f"Wrong number of columns at line {lineno}") - # Convert each value according to its column, then pack it - # according to the dtype's nesting, and store it. - chunk.append(packer(convert_row(words))) - if not chunk: # The islice is empty, i.e. we're done. - break + Two consecutive quote characters within a quoted field are treated as a + single escaped character: - if X is None: - X = np.array(chunk, dtype) - else: - nshape = list(X.shape) - pos = nshape[0] - nshape[0] += len(chunk) - X.resize(nshape, refcheck=False) - X[pos:, ...] = chunk + >>> s = StringIO('"Hello, my name is ""Monty""!"') + >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"') + array('Hello, my name is "Monty"!', dtype='<U26') - if X is None: - X = np.array([], dtype) + """ - # Multicolumn data are returned with shape (1, N, M), i.e. - # (1, 1, M) for a single row - remove the singleton dimension there - if X.ndim == 3 and X.shape[:2] == (1, 1): - X.shape = (1, -1) + if like is not None: + return _loadtxt_with_like( + fname, dtype=dtype, comments=comments, delimiter=delimiter, + converters=converters, skiprows=skiprows, usecols=usecols, + unpack=unpack, ndmin=ndmin, encoding=encoding, + max_rows=max_rows, like=like + ) - X = _ensure_ndmin_ndarray(X, ndmin=ndmin) + if isinstance(delimiter, bytes): + delimiter.decode("latin1") - if unpack: - if len(dtype_types) > 1: - # For structured arrays, return an array for each field. - return [X[field] for field in dtype.names] - else: - return X.T - else: - return X + if dtype is None: + dtype = np.float64 + + comment = comments + # Control character type conversions for Py3 convenience + if comment is not None: + if isinstance(comment, (str, bytes)): + comment = [comment] + comment = [ + x.decode('latin1') if isinstance(x, bytes) else x for x in comment] + if isinstance(delimiter, bytes): + delimiter = delimiter.decode('latin1') + + arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, + converters=converters, skiplines=skiprows, usecols=usecols, + unpack=unpack, ndmin=ndmin, encoding=encoding, + max_rows=max_rows, quote=quotechar) + + return arr _loadtxt_with_like = array_function_dispatch( diff --git a/numpy/lib/shape_base.py b/numpy/lib/shape_base.py index a3fbee3d5..b600b70f6 100644 --- a/numpy/lib/shape_base.py +++ b/numpy/lib/shape_base.py @@ -885,7 +885,7 @@ def hsplit(ary, indices_or_sections): Please refer to the `split` documentation. `hsplit` is equivalent to `split` with ``axis=1``, the array is always split along the second - axis regardless of the array dimension. + axis except for 1-D arrays, where it is split at ``axis=0``. See Also -------- @@ -933,6 +933,12 @@ def hsplit(ary, indices_or_sections): array([[[2., 3.]], [[6., 7.]]])] + With a 1-D array, the split is along axis 0. + + >>> x = np.array([0, 1, 2, 3, 4, 5]) + >>> np.hsplit(x, 2) + [array([0, 1, 2]), array([3, 4, 5])] + """ if _nx.ndim(ary) == 0: raise ValueError('hsplit only works on arrays of 1 or more dimensions') diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index b67a31b18..874754a64 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -890,6 +890,19 @@ class TestDelete: with pytest.raises(IndexError): np.delete([0, 1, 2], np.array([], dtype=float)) + def test_single_item_array(self): + a_del = delete(self.a, 1) + a_del_arr = delete(self.a, np.array([1])) + a_del_lst = delete(self.a, [1]) + a_del_obj = delete(self.a, np.array([1], dtype=object)) + assert_equal(a_del, a_del_arr, a_del_lst, a_del_obj) + + nd_a_del = delete(self.nd_a, 1, axis=1) + nd_a_del_arr = delete(self.nd_a, np.array([1]), axis=1) + nd_a_del_lst = delete(self.nd_a, [1], axis=1) + nd_a_del_obj = delete(self.nd_a, np.array([1], dtype=object), axis=1) + assert_equal(nd_a_del, nd_a_del_arr, nd_a_del_lst, nd_a_del_obj) + class TestGradient: diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index b9b10bc06..a2758123b 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -695,7 +695,7 @@ class TestLoadTxt(LoadTxtBase): assert_array_equal(x, a) d = TextIO() - d.write('M 64.0 75.0\nF 25.0 60.0') + d.write('M 64 75.0\nF 25 60.0') d.seek(0) mydescriptor = {'names': ('gender', 'age', 'weight'), 'formats': ('S1', 'i4', 'f4')} @@ -779,6 +779,8 @@ class TestLoadTxt(LoadTxtBase): a = np.array([[1, 2, 3], [4, 5, 6]], int) assert_array_equal(x, a) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_comments_multi_chars(self): c = TextIO() c.write('/* comment\n1,2,3,5\n') @@ -871,16 +873,27 @@ class TestLoadTxt(LoadTxtBase): bogus_idx = 1.5 assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=bogus_idx ) assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=[0, bogus_idx, 0] ) + def test_bad_usecols(self): + with pytest.raises(OverflowError): + np.loadtxt(["1\n"], usecols=[2**64], delimiter=",") + with pytest.raises((ValueError, OverflowError)): + # Overflow error on 32bit platforms + np.loadtxt(["1\n"], usecols=[2**62], delimiter=",") + with pytest.raises(TypeError, + match="If a structured dtype .*. But 1 usecols were given and " + "the number of fields is 3."): + np.loadtxt(["1,1\n"], dtype="i,(2)i", usecols=[0], delimiter=",") + def test_fancy_dtype(self): c = TextIO() c.write('1,2,3.0\n4,5,6.0\n') @@ -919,8 +932,7 @@ class TestLoadTxt(LoadTxtBase): assert_array_equal(x, a) def test_empty_file(self): - with suppress_warnings() as sup: - sup.filter(message="loadtxt: Empty input file:") + with pytest.warns(UserWarning, match="input contained no data"): c = TextIO() x = np.loadtxt(c) assert_equal(x.shape, (0,)) @@ -981,29 +993,32 @@ class TestLoadTxt(LoadTxtBase): c.write(inp) for dt in [float, np.float32]: c.seek(0) - res = np.loadtxt(c, dtype=dt) + res = np.loadtxt( + c, dtype=dt, converters=float.fromhex, encoding="latin1") assert_equal(res, tgt, err_msg="%s" % dt) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_default_float_converter_no_default_hex_conversion(self): """ Ensure that fromhex is only used for values with the correct prefix and is not called by default. Regression test related to gh-19598. """ c = TextIO("a b c") - with pytest.raises( - ValueError, match="could not convert string to float" - ): + with pytest.raises(ValueError, + match=".*convert string 'a' to float64 at row 0, column 1"): np.loadtxt(c) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_default_float_converter_exception(self): """ Ensure that the exception message raised during failed floating point conversion is correct. Regression test related to gh-19598. """ c = TextIO("qrs tuv") # Invalid values for default float converter - with pytest.raises( - ValueError, match="could not convert string to float" - ): + with pytest.raises(ValueError, + match="could not convert string 'qrs' to float64"): np.loadtxt(c) def test_from_complex(self): @@ -1099,8 +1114,7 @@ class TestLoadTxt(LoadTxtBase): assert_(x.shape == (3,)) # Test ndmin kw with empty file. - with suppress_warnings() as sup: - sup.filter(message="loadtxt: Empty input file:") + with pytest.warns(UserWarning, match="input contained no data"): f = TextIO() assert_(np.loadtxt(f, ndmin=2).shape == (0, 1,)) assert_(np.loadtxt(f, ndmin=1).shape == (0,)) @@ -1132,8 +1146,8 @@ class TestLoadTxt(LoadTxtBase): @pytest.mark.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968', reason="Wrong preferred encoding") def test_binary_load(self): - butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\ - b"20,2,3,\xc3\x95scar\n\r" + butf8 = b"5,6,7,\xc3\x95scarscar\r\n15,2,3,hello\r\n"\ + b"20,2,3,\xc3\x95scar\r\n" sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines() with temppath() as path: with open(path, "wb") as f: @@ -1196,6 +1210,30 @@ class TestLoadTxt(LoadTxtBase): a = np.array([[1, 2, 3, 5], [4, 5, 7, 8], [2, 1, 4, 5]], int) assert_array_equal(x, a) + @pytest.mark.parametrize(["skip", "data"], [ + (1, ["ignored\n", "1,2\n", "\n", "3,4\n"]), + # "Bad" lines that do not end in newlines: + (1, ["ignored", "1,2", "", "3,4"]), + (1, StringIO("ignored\n1,2\n\n3,4")), + # Same as above, but do not skip any lines: + (0, ["-1,0\n", "1,2\n", "\n", "3,4\n"]), + (0, ["-1,0", "1,2", "", "3,4"]), + (0, StringIO("-1,0\n1,2\n\n3,4"))]) + def test_max_rows_empty_lines(self, skip, data): + with pytest.warns(UserWarning, + match=f"Input line 3.*max_rows={3-skip}"): + res = np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",", + max_rows=3-skip) + assert_array_equal(res, [[-1, 0], [1, 2], [3, 4]][skip:]) + + if isinstance(data, StringIO): + data.seek(0) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with pytest.raises(UserWarning): + np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",", + max_rows=3-skip) class Testfromregex: def test_record(self): @@ -2397,6 +2435,13 @@ M 33 21.99 assert_equal(test['f1'], 17179869184) assert_equal(test['f2'], 1024) + def test_unpack_float_data(self): + txt = TextIO("1,2,3\n4,5,6\n7,8,9\n0.0,1.0,2.0") + a, b, c = np.loadtxt(txt, delimiter=",", unpack=True) + assert_array_equal(a, np.array([1.0, 4.0, 7.0, 0.0])) + assert_array_equal(b, np.array([2.0, 5.0, 8.0, 1.0])) + assert_array_equal(c, np.array([3.0, 6.0, 9.0, 2.0])) + def test_unpack_structured(self): # Regression test for gh-4341 # Unpacking should work on structured arrays diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py new file mode 100644 index 000000000..f50e3b8ad --- /dev/null +++ b/numpy/lib/tests/test_loadtxt.py @@ -0,0 +1,1006 @@ +""" +Tests specific to `np.loadtxt` added during the move of loadtxt to be backed +by C code. +These tests complement those found in `test_io.py`. +""" + +import sys +import pytest +from tempfile import NamedTemporaryFile, mkstemp +from io import StringIO + +import numpy as np +from numpy.ma.testutils import assert_equal +from numpy.testing import assert_array_equal, HAS_REFCOUNT, IS_PYPY + + +def test_scientific_notation(): + """Test that both 'e' and 'E' are parsed correctly.""" + data = StringIO( + ( + "1.0e-1,2.0E1,3.0\n" + "4.0e-2,5.0E-1,6.0\n" + "7.0e-3,8.0E1,9.0\n" + "0.0e-4,1.0E-1,2.0" + ) + ) + expected = np.array( + [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]] + ) + assert_array_equal(np.loadtxt(data, delimiter=","), expected) + + +@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"]) +def test_comment_multiple_chars(comment): + content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n" + txt = StringIO(content.replace("#", comment)) + a = np.loadtxt(txt, delimiter=",", comments=comment) + assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]]) + + +@pytest.fixture +def mixed_types_structured(): + """ + Fixture providing hetergeneous input data with a structured dtype, along + with the associated structured array. + """ + data = StringIO( + ( + "1000;2.4;alpha;-34\n" + "2000;3.1;beta;29\n" + "3500;9.9;gamma;120\n" + "4090;8.1;delta;0\n" + "5001;4.4;epsilon;-99\n" + "6543;7.8;omega;-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + return data, dtype, expected + + +@pytest.mark.parametrize('skiprows', [0, 1, 2, 3]) +def test_structured_dtype_and_skiprows_no_empty_lines( + skiprows, mixed_types_structured): + data, dtype, expected = mixed_types_structured + a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows) + assert_array_equal(a, expected[skiprows:]) + + +def test_unpack_structured(mixed_types_structured): + data, dtype, expected = mixed_types_structured + + a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True) + assert_array_equal(a, expected["f0"]) + assert_array_equal(b, expected["f1"]) + assert_array_equal(c, expected["f2"]) + assert_array_equal(d, expected["f3"]) + + +def test_structured_dtype_with_shape(): + dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)]) + data = StringIO("0,1,2,3\n6,7,8,9\n") + expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected) + + +def test_structured_dtype_with_multi_shape(): + dtype = np.dtype([("a", "u1", (2, 2))]) + data = StringIO("0 1 2 3\n") + expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype) + assert_array_equal(np.loadtxt(data, dtype=dtype), expected) + + +def test_nested_structured_subarray(): + # Test from gh-16678 + point = np.dtype([('x', float), ('y', float)]) + dt = np.dtype([('code', int), ('points', point, (2,))]) + data = StringIO("100,1,2,3,4\n200,5,6,7,8\n") + expected = np.array( + [ + (100, [(1., 2.), (3., 4.)]), + (200, [(5., 6.), (7., 8.)]), + ], + dtype=dt + ) + assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected) + + +def test_structured_dtype_offsets(): + # An aligned structured dtype will have additional padding + dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True) + data = StringIO("1,2,3,4,5,6\n7,8,9,10,11,12\n") + expected = np.array([(1, 2, 3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_negative_row_limits(param): + """skiprows and max_rows should raise for negative parameters.""" + with pytest.raises(ValueError, match="argument must be nonnegative"): + np.loadtxt("foo.bar", **{param: -3}) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_noninteger_row_limits(param): + with pytest.raises(TypeError, match="argument must be an integer"): + np.loadtxt("foo.bar", **{param: 1.0}) + + +@pytest.mark.parametrize( + "data, shape", + [ + ("1 2 3 4 5\n", (1, 5)), # Single row + ("1\n2\n3\n4\n5\n", (5, 1)), # Single column + ] +) +def test_ndmin_single_row_or_col(data, shape): + arr = np.array([1, 2, 3, 4, 5]) + arr2d = arr.reshape(shape) + + assert_array_equal(np.loadtxt(StringIO(data), dtype=int), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=0), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=1), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=2), arr2d) + + +@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"]) +def test_bad_ndmin(badval): + with pytest.raises(ValueError, match="Illegal value of ndmin keyword"): + np.loadtxt("foo.bar", ndmin=badval) + + +@pytest.mark.parametrize( + "ws", + ( + " ", # space + "\t", # tab + "\u2003", # em + "\u00A0", # non-break + "\u3000", # ideographic space + ) +) +def test_blank_lines_spaces_delimit(ws): + txt = StringIO( + f"1 2{ws}30\n\n{ws}\n" + f"4 5 60{ws}\n {ws} \n" + f"7 8 {ws} 90\n # comment\n" + f"3 2 1" + ) + # NOTE: It is unclear that the ` # comment` should succeed. Except + # for delimiter=None, which should use any whitespace (and maybe + # should just be implemented closer to Python + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected + ) + + +def test_blank_lines_normal_delimiter(): + txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1') + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected + ) + + +@pytest.mark.parametrize("dtype", (float, object)) +def test_maxrows_no_blank_lines(dtype): + txt = StringIO("1.5,2.5\n3.0,4.0\n5.5,6.0") + res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2) + assert_equal(res.dtype, dtype) + assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype)) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) +def test_exception_message_bad_values(dtype): + txt = StringIO("1,2\n3,XXX\n5,6") + msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2" + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + +def test_converters_negative_indices(): + txt = StringIO('1.5,2.5\n3.0,XXX\n5.5,6.0') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]]) + res = np.loadtxt( + txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None + ) + assert_equal(res, expected) + + +def test_converters_negative_indices_with_usecols(): + txt = StringIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) + res = np.loadtxt( + txt, + dtype=np.float64, + delimiter=",", + converters=conv, + usecols=[0, -1], + encoding=None, + ) + assert_equal(res, expected) + + # Second test with variable number of rows: + res = np.loadtxt(StringIO('''0,1,2\n0,1,2,3,4'''), delimiter=",", + usecols=[0, -1], converters={-1: (lambda x: -1)}) + assert_array_equal(res, [[0, -1], [0, -1]]) + +def test_ragged_usecols(): + # usecols, and negative ones, work even with varying number of columns. + txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + expected = np.array([[0, 0], [0, 0], [0, 0]]) + res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + assert_equal(res, expected) + + txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n") + with pytest.raises(ValueError, + match="invalid column index -2 at row 1 with 2 columns"): + # There is no -2 column in the second row: + np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + + +def test_empty_usecols(): + txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[]) + assert res.shape == (3,) + assert res.dtype == np.dtype([]) + + +@pytest.mark.parametrize("c1", ["a", "の", "🫕"]) +@pytest.mark.parametrize("c2", ["a", "の", "🫕"]) +def test_large_unicode_characters(c1, c2): + # c1 and c2 span ascii, 16bit and 32bit range. + txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g") + res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",") + expected = np.array( + [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")], + dtype=np.dtype('U12') + ) + assert_equal(res, expected) + + +def test_unicode_with_converter(): + txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n") + conv = {0: lambda s: s.upper()} + res = np.loadtxt( + txt, + dtype=np.dtype("U12"), + converters=conv, + delimiter=",", + encoding=None + ) + expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']]) + assert_equal(res, expected) + + +def test_converter_with_structured_dtype(): + txt = StringIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n') + dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')]) + conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()} + res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv) + expected = np.array( + [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt + ) + assert_equal(res, expected) + + +def test_converter_with_unicode_dtype(): + """ + With the default 'bytes' encoding, tokens are encoded prior to being + passed to the converter. This means that the output of the converter may + be bytes instead of unicode as expected by `read_rows`. + + This test checks that outputs from the above scenario are properly decoded + prior to parsing by `read_rows`. + """ + txt = StringIO('abc,def\nrst,xyz') + conv = bytes.upper + res = np.loadtxt( + txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") + expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) + assert_equal(res, expected) + + +def test_read_huge_row(): + row = "1.5, 2.5," * 50000 + row = row[:-1] + "\n" + txt = StringIO(row * 2) + res = np.loadtxt(txt, delimiter=",", dtype=float) + assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) + + +@pytest.mark.parametrize("dtype", "edfgFDG") +def test_huge_float(dtype): + # Covers a non-optimized path that is rarely taken: + field = "0" * 1000 + ".123456789" + dtype = np.dtype(dtype) + value = np.loadtxt([field], dtype=dtype)[()] + assert value == dtype.type("0.123456789") + + +@pytest.mark.parametrize( + ("given_dtype", "expected_dtype"), + [ + ("S", np.dtype("S5")), + ("U", np.dtype("U5")), + ], +) +def test_string_no_length_given(given_dtype, expected_dtype): + """ + The given dtype is just 'S' or 'U' with no length. In these cases, the + length of the resulting dtype is determined by the longest string found + in the file. + """ + txt = StringIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n") + res = np.loadtxt(txt, dtype=given_dtype, delimiter=",") + expected = np.array( + [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype + ) + assert_equal(res, expected) + assert_equal(res.dtype, expected_dtype) + + +def test_float_conversion(): + """ + Some tests that the conversion to float64 works as accurately as the + Python built-in `float` function. In a naive version of the float parser, + these strings resulted in values that were off by an ULP or two. + """ + strings = [ + '0.9999999999999999', + '9876543210.123456', + '5.43215432154321e+300', + '0.901', + '0.333', + ] + txt = StringIO('\n'.join(strings)) + res = np.loadtxt(txt) + expected = np.array([float(s) for s in strings]) + assert_equal(res, expected) + + +def test_bool(): + # Simple test for bool via integer + txt = StringIO("1, 0\n10, -1") + res = np.loadtxt(txt, dtype=bool, delimiter=",") + assert res.dtype == bool + assert_array_equal(res, [[True, False], [True, True]]) + # Make sure we use only 1 and 0 on the byte level: + assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_integer_signs(dtype): + dtype = np.dtype(dtype) + assert np.loadtxt(["+2"], dtype=dtype) == 2 + if dtype.kind == "u": + with pytest.raises(ValueError): + np.loadtxt(["-1\n"], dtype=dtype) + else: + assert np.loadtxt(["-2\n"], dtype=dtype) == -2 + + for sign in ["++", "+-", "--", "-+"]: + with pytest.raises(ValueError): + np.loadtxt([f"{sign}2\n"], dtype=dtype) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_implicit_cast_float_to_int_fails(dtype): + txt = StringIO("1.0, 2.1, 3.7\n4, 5, 6") + with pytest.raises(ValueError): + np.loadtxt(txt, dtype=dtype, delimiter=",") + +@pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) +@pytest.mark.parametrize("with_parens", (False, True)) +def test_complex_parsing(dtype, with_parens): + s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)" + if not with_parens: + s = s.replace("(", "").replace(")", "") + + res = np.loadtxt(StringIO(s), dtype=dtype, delimiter=",") + expected = np.array( + [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 0]], dtype=dtype + ) + assert_equal(res, expected) + + +def test_read_from_generator(): + def gen(): + for i in range(4): + yield f"{i},{2*i},{i**2}" + + res = np.loadtxt(gen(), dtype=int, delimiter=",") + expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]]) + assert_equal(res, expected) + + +def test_read_from_generator_multitype(): + def gen(): + for i in range(3): + yield f"{i} {i / 4}" + + res = np.loadtxt(gen(), dtype="i, d", delimiter=" ") + expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d") + assert_equal(res, expected) + + +def test_read_from_bad_generator(): + def gen(): + for entry in ["1,2", b"3, 5", 12738]: + yield entry + + with pytest.raises( + TypeError, match=r"non-string returned while reading data"): + np.loadtxt(gen(), dtype="i, i", delimiter=",") + + +@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") +def test_object_cleanup_on_read_error(): + sentinel = object() + already_read = 0 + + def conv(x): + nonlocal already_read + if already_read > 4999: + raise ValueError("failed half-way through!") + already_read += 1 + return sentinel + + txt = StringIO("x\n" * 10000) + + with pytest.raises(ValueError, match="at row 5000, column 1"): + np.loadtxt(txt, dtype=object, converters={0: conv}) + + assert sys.getrefcount(sentinel) == 2 + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +def test_character_not_bytes_compatible(): + """Test exception when a character cannot be encoded as 'S'.""" + data = StringIO("–") # == \u2013 + with pytest.raises(ValueError): + np.loadtxt(data, dtype="S5") + + +@pytest.mark.parametrize("conv", (0, [float], "")) +def test_invalid_converter(conv): + msg = ( + "converters must be a dictionary mapping columns to converter " + "functions or a single callable." + ) + with pytest.raises(TypeError, match=msg): + np.loadtxt(StringIO("1 2\n3 4"), converters=conv) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +def test_converters_dict_raises_non_integer_key(): + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}) + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}, usecols=0) + + +@pytest.mark.parametrize("bad_col_ind", (3, -3)) +def test_converters_dict_raises_non_col_key(bad_col_ind): + data = StringIO("1 2\n3 4") + with pytest.raises(ValueError, match="converter specified for column"): + np.loadtxt(data, converters={bad_col_ind: int}) + + +def test_converters_dict_raises_val_not_callable(): + with pytest.raises(TypeError, + match="values of the converters dictionary must be callable"): + np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) + + +@pytest.mark.parametrize("q", ('"', "'", "`")) +def test_quoted_field(q): + txt = StringIO( + f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n" + ) + dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)]) + expected = np.array( + [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype + ) + + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) + assert_array_equal(res, expected) + + +def test_quote_support_default(): + """Support for quoted fields is disabled by default.""" + txt = StringIO('"lat,long", 45, 30\n') + dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)]) + + with pytest.raises(ValueError, match="the number of columns changed"): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + # Enable quoting support with non-None value for quotechar param + txt.seek(0) + expected = np.array([("lat,long", 45., 30.)], dtype=dtype) + + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') + assert_array_equal(res, expected) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +def test_quotechar_multichar_error(): + txt = StringIO("1,2\n3,4") + msg = r".*must be a single unicode character or None" + with pytest.raises(TypeError, match=msg): + np.loadtxt(txt, delimiter=",", quotechar="''") + + +def test_comment_multichar_error_with_quote(): + txt = StringIO("1,2\n3,4") + msg = ( + "when multiple comments or a multi-character comment is given, " + "quotes are not supported." + ) + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments="123", quotechar='"') + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"') + + # A single character string in a tuple is unpacked though: + res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'") + assert_equal(res, [[1, 2], [3, 4]]) + + +def test_structured_dtype_with_quotes(): + data = StringIO( + ( + "1000;2.4;'alpha';-34\n" + "2000;3.1;'beta';29\n" + "3500;9.9;'gamma';120\n" + "4090;8.1;'delta';0\n" + "5001;4.4;'epsilon';-99\n" + "6543;7.8;'omega';-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'") + assert_array_equal(res, expected) + + +def test_quoted_field_is_not_empty(): + txt = StringIO('1\n\n"4"\n""') + expected = np.array(["1", "4", ""], dtype="U1") + res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') + assert_equal(res, expected) + +def test_quoted_field_is_not_empty_nonstrict(): + # Same as test_quoted_field_is_not_empty but check that we are not strict + # about missing closing quote (this is the `csv.reader` default also) + txt = StringIO('1\n\n"4"\n"') + expected = np.array(["1", "4", ""], dtype="U1") + res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') + assert_equal(res, expected) + +def test_consecutive_quotechar_escaped(): + txt = StringIO('"Hello, my name is ""Monty""!"') + expected = np.array('Hello, my name is "Monty"!', dtype="U40") + res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"') + assert_equal(res, expected) + + +@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n")) +@pytest.mark.parametrize("ndmin", (0, 1, 2)) +@pytest.mark.parametrize("usecols", [None, (1, 2, 3)]) +def test_warn_on_no_data(data, ndmin, usecols): + """Check that a UserWarning is emitted when no data is read from input.""" + if usecols is not None: + expected_shape = (0, 3) + elif ndmin == 2: + expected_shape = (0, 1) # guess a single column?! + else: + expected_shape = (0,) + + txt = StringIO(data) + with pytest.warns(UserWarning, match="input contained no data"): + res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) + assert res.shape == expected_shape + + with NamedTemporaryFile(mode="w") as fh: + fh.write(data) + fh.seek(0) + with pytest.warns(UserWarning, match="input contained no data"): + res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) + assert res.shape == expected_shape + +@pytest.mark.parametrize("skiprows", (2, 3)) +def test_warn_on_skipped_data(skiprows): + data = "1 2 3\n4 5 6" + txt = StringIO(data) + with pytest.warns(UserWarning, match="input contained no data"): + np.loadtxt(txt, skiprows=skiprows) + + +@pytest.mark.parametrize(["dtype", "value"], [ + ("i2", 0x0001), ("u2", 0x0001), + ("i4", 0x00010203), ("u4", 0x00010203), + ("i8", 0x0001020304050607), ("u8", 0x0001020304050607), + # The following values are constructed to lead to unique bytes: + ("float16", 3.07e-05), + ("float32", 9.2557e-41), ("complex64", 9.2557e-41+2.8622554e-29j), + ("float64", -1.758571353180402e-24), + # Here and below, the repr side-steps a small loss of precision in + # complex `str` in PyPy (which is probably fine, as repr works): + ("complex128", repr(5.406409232372729e-29-1.758571353180402e-24j)), + # Use integer values that fit into double. Everything else leads to + # problems due to longdoubles going via double and decimal strings + # causing rounding errors. + ("longdouble", 0x01020304050607), + ("clongdouble", repr(0x01020304050607 + (0x00121314151617 * 1j))), + ("U2", "\U00010203\U000a0b0c")]) +@pytest.mark.parametrize("swap", [True, False]) +def test_byteswapping_and_unaligned(dtype, value, swap): + # Try to create "interesting" values within the valid unicode range: + dtype = np.dtype(dtype) + data = [f"x,{value}\n"] # repr as PyPy `str` truncates some + if swap: + dtype = dtype.newbyteorder() + full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False) + # The above ensures that the interesting "b" field is unaligned: + assert full_dt.fields["b"][1] == 1 + res = np.loadtxt(data, dtype=full_dt, delimiter=",", encoding=None, + max_rows=1) # max-rows prevents over-allocation + assert res["b"] == dtype.type(value) + + +@pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + "efdFD" + "?") +def test_unicode_whitespace_stripping(dtype): + # Test that all numeric types (and bool) strip whitespace correctly + # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted. + # Currently, skip float128 as it did not always support this and has no + # "custom" parsing: + txt = StringIO(' 3 ,"\u202F2\n"') + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') + assert_array_equal(res, np.array([3, 2]).astype(dtype)) + + +@pytest.mark.parametrize("dtype", "FD") +def test_unicode_whitespace_stripping_complex(dtype): + # Complex has a few extra cases since it has two components and + # parentheses + line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n" + data = [line, line.replace(" ", "\u202F")] + res = np.loadtxt(data, dtype=dtype, delimiter=',') + assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2)) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", "FD") +@pytest.mark.parametrize("field", + ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"]) +def test_bad_complex(dtype, field): + with pytest.raises(ValueError): + np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + "efgdFDG" + "?") +def test_nul_character_error(dtype): + # Test that a \0 character is correctly recognized as an error even if + # what comes before is valid (not everything gets parsed internally). + if dtype.lower() == "g": + pytest.xfail("longdouble/clongdouble assignment may misbehave.") + with pytest.raises(ValueError): + np.loadtxt(["1\000"], dtype=dtype, delimiter=",", quotechar='"') + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +@pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + "efgdFDG" + "?") +def test_no_thousands_support(dtype): + # Mainly to document behaviour, Python supports thousands like 1_1. + # (e and G may end up using different conversion and support it, this is + # a bug but happens...) + if dtype == "e": + pytest.skip("half assignment currently uses Python float converter") + if dtype in "eG": + pytest.xfail("clongdouble assignment is buggy (uses `complex`?).") + + assert int("1_1") == float("1_1") == complex("1_1") == 11 + with pytest.raises(ValueError): + np.loadtxt(["1_1\n"], dtype=dtype) + + +@pytest.mark.parametrize("data", [ + ["1,2\n", "2\n,3\n"], + ["1,2\n", "2\r,3\n"]]) +def test_bad_newline_in_iterator(data): + # In NumPy <=1.22 this was accepted, because newlines were completely + # ignored when the input was an iterable. This could be changed, but right + # now, we raise an error. + msg = "Found an unquoted embedded newline within a single line" + with pytest.raises(ValueError, match=msg): + np.loadtxt(data, delimiter=",") + + +@pytest.mark.parametrize("data", [ + ["1,2\n", "2,3\r\n"], # a universal newline + ["1,2\n", "'2\n',3\n"], # a quoted newline + ["1,2\n", "'2\r',3\n"], + ["1,2\n", "'2\r\n',3\n"], +]) +def test_good_newline_in_iterator(data): + # The quoted newlines will be untransformed here, but are just whitespace. + res = np.loadtxt(data, delimiter=",", quotechar="'") + assert_array_equal(res, [[1., 2.], [2., 3.]]) + + +@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"]) +def test_universal_newlines_quoted(newline): + # Check that universal newline support within the tokenizer is not applied + # to quoted fields. (note that lines must end in newline or quoted + # fields will not include a newline at all) + data = ['1,"2\n"\n', '3,"4\n', '1"\n'] + data = [row.replace("\n", newline) for row in data] + res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"') + assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']]) + + +def test_null_character(): + # Basic tests to check that the NUL character is not special: + res = np.loadtxt(["1\0002\0003\n", "4\0005\0006"], delimiter="\000") + assert_array_equal(res, [[1, 2, 3], [4, 5, 6]]) + + # Also not as part of a field (avoid unicode/arrays as unicode strips \0) + res = np.loadtxt(["1\000,2\000,3\n", "4\000,5\000,6"], + delimiter=",", dtype=object) + assert res.tolist() == [["1\000", "2\000", "3"], ["4\000", "5\000", "6"]] + + +def test_iterator_fails_getting_next_line(): + class BadSequence: + def __len__(self): + return 100 + + def __getitem__(self, item): + if item == 50: + raise RuntimeError("Bad things happened!") + return f"{item}, {item+1}" + + with pytest.raises(RuntimeError, match="Bad things happened!"): + np.loadtxt(BadSequence(), dtype=int, delimiter=",") + + +class TestCReaderUnitTests: + # These are internal tests for path that should not be possible to hit + # unless things go very very wrong somewhere. + def test_not_an_filelike(self): + with pytest.raises(AttributeError, match=".*read"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype=np.dtype("i"), filelike=True) + + def test_filelike_read_fails(self): + # Can only be reached if loadtxt opens the file, so it is hard to do + # via the public interface (although maybe not impossible considering + # the current "DataClass" backing). + class BadFileLike: + counter = 0 + + def read(self, size): + self.counter += 1 + if self.counter > 20: + raise RuntimeError("Bad bad bad!") + return "1,2,3\n" + + with pytest.raises(RuntimeError, match="Bad bad bad!"): + np.core._multiarray_umath._load_from_filelike( + BadFileLike(), dtype=np.dtype("i"), filelike=True) + + def test_filelike_bad_read(self): + # Can only be reached if loadtxt opens the file, so it is hard to do + # via the public interface (although maybe not impossible considering + # the current "DataClass" backing). + + class BadFileLike: + counter = 0 + + def read(self, size): + return 1234 # not a string! + + with pytest.raises(TypeError, + match="non-string returned while reading data"): + np.core._multiarray_umath._load_from_filelike( + BadFileLike(), dtype=np.dtype("i"), filelike=True) + + def test_not_an_iter(self): + with pytest.raises(TypeError, + match="error reading from object, expected an iterable"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype=np.dtype("i"), filelike=False) + + def test_bad_type(self): + with pytest.raises(TypeError, match="internal error: dtype must"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype="i", filelike=False) + + def test_bad_encoding(self): + with pytest.raises(TypeError, match="encoding must be a unicode"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype=np.dtype("i"), filelike=False, encoding=123) + + @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"]) + def test_manual_universal_newlines(self, newline): + # This is currently not available to users, because we should always + # open files with universal newlines enabled `newlines=None`. + # (And reading from an iterator uses slightly different code paths.) + # We have no real support for `newline="\r"` or `newline="\n" as the + # user cannot specify those options. + data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline), + newline="") + + res = np.core._multiarray_umath._load_from_filelike( + data, dtype=np.dtype("U10"), filelike=True, + quote='"', comment="#", skiplines=1) + assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "]) + + +def test_delimiter_comment_collision_raises(): + with pytest.raises(TypeError, match=".*control characters.*incompatible"): + np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",") + + +def test_delimiter_quotechar_collision_raises(): + with pytest.raises(TypeError, match=".*control characters.*incompatible"): + np.loadtxt(StringIO("1, 2, 3"), delimiter=",", quotechar=",") + + +def test_comment_quotechar_collision_raises(): + with pytest.raises(TypeError, match=".*control characters.*incompatible"): + np.loadtxt(StringIO("1 2 3"), comments="#", quotechar="#") + + +def test_delimiter_and_multiple_comments_collision_raises(): + with pytest.raises( + TypeError, match="Comment characters.*cannot include the delimiter" + ): + np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=["#", ","]) + + +@pytest.mark.parametrize( + "ws", + ( + " ", # space + "\t", # tab + "\u2003", # em + "\u00A0", # non-break + "\u3000", # ideographic space + ) +) +def test_collision_with_default_delimiter_raises(ws): + with pytest.raises(TypeError, match=".*control characters.*incompatible"): + np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), comments=ws) + with pytest.raises(TypeError, match=".*control characters.*incompatible"): + np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), quotechar=ws) + + +@pytest.mark.parametrize("nl", ("\n", "\r")) +def test_control_character_newline_raises(nl): + txt = StringIO(f"1{nl}2{nl}3{nl}{nl}4{nl}5{nl}6{nl}{nl}") + msg = "control character.*cannot be a newline" + with pytest.raises(TypeError, match=msg): + np.loadtxt(txt, delimiter=nl) + with pytest.raises(TypeError, match=msg): + np.loadtxt(txt, comments=nl) + with pytest.raises(TypeError, match=msg): + np.loadtxt(txt, quotechar=nl) + + +@pytest.mark.parametrize( + ("generic_data", "long_datum", "unitless_dtype", "expected_dtype"), + [ + ("2012-03", "2013-01-15", "M8", "M8[D]"), # Datetimes + ("spam-a-lot", "tis_but_a_scratch", "U", "U17"), # str + ], +) +@pytest.mark.parametrize("nrows", (10, 50000, 60000)) # lt, eq, gt chunksize +def test_parametric_unit_discovery( + generic_data, long_datum, unitless_dtype, expected_dtype, nrows +): + """Check that the correct unit (e.g. month, day, second) is discovered from + the data when a user specifies a unitless datetime.""" + # Unit should be "D" (days) due to last entry + data = [generic_data] * 50000 + [long_datum] + expected = np.array(data, dtype=expected_dtype) + + # file-like path + txt = StringIO("\n".join(data)) + a = np.loadtxt(txt, dtype=unitless_dtype) + assert a.dtype == expected.dtype + assert_equal(a, expected) + + # file-obj path + fd, fname = mkstemp() + with open(fname, "w") as fh: + fh.write("\n".join(data)) + a = np.loadtxt(fname, dtype=unitless_dtype) + assert a.dtype == expected.dtype + assert_equal(a, expected) + + +def test_str_dtype_unit_discovery_with_converter(): + data = ["spam-a-lot"] * 60000 + ["XXXtis_but_a_scratch"] + expected = np.array( + ["spam-a-lot"] * 60000 + ["tis_but_a_scratch"], dtype="U17" + ) + conv = lambda s: s.strip("XXX") + + # file-like path + txt = StringIO("\n".join(data)) + a = np.loadtxt(txt, dtype="U", converters=conv, encoding=None) + assert a.dtype == expected.dtype + assert_equal(a, expected) + + # file-obj path + fd, fname = mkstemp() + with open(fname, "w") as fh: + fh.write("\n".join(data)) + a = np.loadtxt(fname, dtype="U", converters=conv, encoding=None) + assert a.dtype == expected.dtype + assert_equal(a, expected) + + +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") +def test_control_character_empty(): + with pytest.raises(TypeError, match="Text reading control character must"): + np.loadtxt(StringIO("1 2 3"), delimiter="") + with pytest.raises(TypeError, match="Text reading control character must"): + np.loadtxt(StringIO("1 2 3"), quotechar="") + with pytest.raises(ValueError, match="comments cannot be an empty string"): + np.loadtxt(StringIO("1 2 3"), comments="") + with pytest.raises(ValueError, match="comments cannot be an empty string"): + np.loadtxt(StringIO("1 2 3"), comments=["#", ""]) + + +def test_control_characters_as_bytes(): + """Byte control characters (comments, delimiter) are supported.""" + a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",") + assert_equal(a, [1, 2, 3]) diff --git a/numpy/lib/utils.py b/numpy/lib/utils.py index c74ee127d..e8f4952d3 100644 --- a/numpy/lib/utils.py +++ b/numpy/lib/utils.py @@ -25,8 +25,7 @@ def get_include(): Notes ----- - When using ``distutils``, for example in ``setup.py``. - :: + When using ``distutils``, for example in ``setup.py``:: import numpy as np ... |