diff options
author | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-14 19:04:16 -0600 |
---|---|---|
committer | Sebastian Berg <sebastian@sipsolutions.net> | 2022-01-14 20:07:07 -0600 |
commit | 90c71f0a8a84d9f17243e28e01527b5fd1ecdbb9 (patch) | |
tree | 311c223111667426615e083d08618f2bf91985e2 /numpy/lib/tests | |
parent | 0cb6bdcf2a28e8a3a74a302d0807cd054a15925f (diff) | |
download | numpy-90c71f0a8a84d9f17243e28e01527b5fd1ecdbb9.tar.gz |
TST: Move most new loadtxt tests to its own file
This also adds two basic new tests around files/strings containing
the \0 character (proving that we handle that gracefully).
Also adds tests for:
* the `_` thousands delimiter (should fail, but doesn't for float128
right now)
* Failure modes when the number of rows changes (negative specifically)
Many of these tests came originally from Warren Weckesser and others
were added by Ross Barnowski:
Co-authored-by: Warren Weckesser <warren.weckesser@gmail.com>
Co-authored-by: Ross Barnowski <rossbar@berkeley.edu>
Diffstat (limited to 'numpy/lib/tests')
-rw-r--r-- | numpy/lib/tests/test_io.py | 767 | ||||
-rw-r--r-- | numpy/lib/tests/test_loadtxt.py | 836 |
2 files changed, 836 insertions, 767 deletions
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 5f66e0b6a..f142972b2 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -2722,770 +2722,3 @@ def test_load_refcount(): with assert_no_gc_cycles(): x = np.loadtxt(TextIO("0 1 2 3"), dtype=dt) assert_equal(x, np.array([((0, 1), (2, 3))], dtype=dt)) - - -def test_loadtxt_scientific_notation(): - """Test that both 'e' and 'E' are parsed correctly.""" - data = TextIO( - ( - "1.0e-1,2.0E1,3.0\n" - "4.0e-2,5.0E-1,6.0\n" - "7.0e-3,8.0E1,9.0\n" - "0.0e-4,1.0E-1,2.0" - ) - ) - expected = np.array( - [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]] - ) - assert_array_equal(np.loadtxt(data, delimiter=","), expected) - - -@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"]) -def test_loadtxt_comment_multiple_chars(comment): - content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n" - txt = TextIO(content.replace("#", comment)) - a = np.loadtxt(txt, delimiter=",", comments=comment) - assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]]) - - -@pytest.fixture -def mixed_types_structured(): - """ - Fixture providing hetergeneous input data with a structured dtype, along - with the associated structured array. 
- """ - data = TextIO( - ( - "1000;2.4;alpha;-34\n" - "2000;3.1;beta;29\n" - "3500;9.9;gamma;120\n" - "4090;8.1;delta;0\n" - "5001;4.4;epsilon;-99\n" - "6543;7.8;omega;-1\n" - ) - ) - dtype = np.dtype( - [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] - ) - expected = np.array( - [ - (1000, 2.4, "alpha", -34), - (2000, 3.1, "beta", 29), - (3500, 9.9, "gamma", 120), - (4090, 8.1, "delta", 0), - (5001, 4.4, "epsilon", -99), - (6543, 7.8, "omega", -1) - ], - dtype=dtype - ) - return data, dtype, expected - - -@pytest.mark.parametrize('skiprows', [0, 1, 2, 3]) -def test_loadtxt_structured_dtype_and_skiprows_no_empty_lines( - skiprows, mixed_types_structured - ): - data, dtype, expected = mixed_types_structured - a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows) - assert_array_equal(a, expected[skiprows:]) - - -def test_loadtxt_unpack_structured(mixed_types_structured): - data, dtype, expected = mixed_types_structured - - a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True) - assert_array_equal(a, expected["f0"]) - assert_array_equal(b, expected["f1"]) - assert_array_equal(c, expected["f2"]) - assert_array_equal(d, expected["f3"]) - - -def test_loadtxt_structured_dtype_with_shape(): - dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)]) - data = TextIO("0,1,2,3\n6,7,8,9\n") - expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype) - assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected) - - -def test_loadtxt_structured_dtype_with_multi_shape(): - dtype = np.dtype([("a", "u1", (2, 2))]) - data = TextIO("0 1 2 3\n") - expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype) - assert_array_equal(np.loadtxt(data, dtype=dtype), expected) - - -def test_loadtxt_nested_structured_subarray(): - # Test from gh-16678 - point = np.dtype([('x', float), ('y', float)]) - dt = np.dtype([('code', int), ('points', point, (2,))]) - data = TextIO("100,1,2,3,4\n200,5,6,7,8\n") - expected = np.array( - 
[ - (100, [(1., 2.), (3., 4.)]), - (200, [(5., 6.), (7., 8.)]), - ], - dtype=dt - ) - assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected) - - -def test_loadtxt_structured_dtype_offsets(): - # An aligned structured dtype will have additional padding - dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True) - data = TextIO("1,2,3,4,5,6\n7,8,9,10,11,12\n") - expected = np.array([(1, 2, 3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt) - assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected) - - -@pytest.mark.parametrize("param", ("skiprows", "max_rows")) -def test_loadtxt_exception_negative_row_limits(param): - """skiprows and max_rows should raise for negative parameters.""" - with pytest.raises(ValueError, match="argument must be nonnegative"): - np.loadtxt("foo.bar", **{param: -3}) - - -@pytest.mark.parametrize("param", ("skiprows", "max_rows")) -def test_loadtxt_exception_noninteger_row_limits(param): - with pytest.raises(TypeError, match="argument must be an integer"): - np.loadtxt("foo.bar", **{param: 1.0}) - - -@pytest.mark.parametrize( - "data, shape", - [ - ("1 2 3 4 5\n", (1, 5)), # Single row - ("1\n2\n3\n4\n5\n", (5, 1)), # Single column - ] -) -def test_loadtxt_ndmin_single_row_or_col(data, shape): - arr = np.array([1, 2, 3, 4, 5]) - arr2d = arr.reshape(shape) - - assert_array_equal(np.loadtxt(TextIO(data), dtype=int), arr) - assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=0), arr) - assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=1), arr) - assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=2), arr2d) - - -@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"]) -def test_loadtxt_bad_ndmin(badval): - with pytest.raises(ValueError, match="Illegal value of ndmin keyword"): - np.loadtxt("foo.bar", ndmin=badval) - - -@pytest.mark.parametrize( - "ws", - ( - "\t", # tab - "\u2003", # em - "\u00A0", # non-break - "\u3000", # ideographic space - ) -) -def 
test_loadtxt_blank_lines_spaces_delimit(ws): - txt = StringIO( - f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 {ws} 90\n # comment\n3 2 1" - ) - # NOTE: It is unclear that the ` # comment` should succeed. Except - # for delimiter=None, which should use any whitespace (and maybe - # should just be implemented closer to Python - expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) - assert_equal( - np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected - ) - - -def test_loadtxt_blank_lines_normal_delimiter(): - txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1') - expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) - assert_equal( - np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected - ) - - -@pytest.mark.parametrize("dtype", (float, object)) -def test_loadtxt_maxrows_no_blank_lines(dtype): - txt = TextIO("1.5,2.5\n3.0,4.0\n5.5,6.0") - res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2) - assert_equal(res.dtype, dtype) - assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype)) - - -@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) -def test_loadtxt_exception_message_bad_values(dtype): - txt = TextIO("1,2\n3,XXX\n5,6") - msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2" - with pytest.raises(ValueError, match=msg): - np.loadtxt(txt, dtype=dtype, delimiter=",") - - -def test_loadtxt_converters_negative_indices(): - txt = TextIO('1.5,2.5\n3.0,XXX\n5.5,6.0') - conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} - expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]]) - res = np.loadtxt( - txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None - ) - assert_equal(res, expected) - - -def test_loadtxt_converters_negative_indices_with_usecols(): - txt = TextIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') - conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} - expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) - res 
= np.loadtxt( - txt, - dtype=np.float64, - delimiter=",", - converters=conv, - usecols=[0, -1], - encoding=None, - ) - assert_equal(res, expected) - - -def test_loadtxt_ragged_usecols(): - # usecols, and negative ones, work even with varying number of columns. - txt = TextIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") - expected = np.array([[0, 0], [0, 0], [0, 0]]) - res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) - assert_equal(res, expected) - - -def test_loadtxt_empty_usecols(): - txt = TextIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") - res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[]) - assert res.shape == (3,) - assert res.dtype == np.dtype([]) - - -@pytest.mark.parametrize("c1", ["a", "の", "🫕"]) -@pytest.mark.parametrize("c2", ["a", "の", "🫕"]) -def test_loadtxt_large_unicode_characters(c1, c2): - # c1 and c2 span ascii, 16bit and 32bit range. - txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g") - res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",") - expected = np.array( - [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")], - dtype=np.dtype('U12') - ) - assert_equal(res, expected) - - -def test_loadtxt_unicode_with_converter(): - txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n") - conv = {0: lambda s: s.upper()} - res = np.loadtxt( - txt, - dtype=np.dtype("U12"), - converters=conv, - delimiter=",", - encoding=None - ) - expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']]) - assert_equal(res, expected) - - -def test_loadtxt_converter_with_structured_dtype(): - txt = TextIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n') - dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')]) - conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()} - res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv) - expected = np.array( - [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt - ) - assert_equal(res, expected) - - -def test_loadtxt_converter_with_unicode_dtype(): - """ - With 
the default 'bytes' encoding, tokens are encoded prior to being passed - to the converter. This means that the output of the converter may be bytes - instead of unicode as expected by `read_rows`. - - This test checks that outputs from the above scenario are properly decoded - prior to parsing by `read_rows`. - """ - txt = StringIO('abc,def\nrst,xyz') - conv = bytes.upper - res = np.loadtxt(txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") - expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) - assert_equal(res, expected) - - -def test_loadtxt_read_huge_row(): - row = "1.5, 2.5," * 50000 - row = row[:-1] + "\n" - txt = TextIO(row * 2) - res = np.loadtxt(txt, delimiter=",", dtype=float) - assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) - - -@pytest.mark.parametrize("dtype", "edfgFDG") -def test_loadtxt_huge_float(dtype): - # Covers a non-optimized path that is rarely taken: - field = "0" * 1000 + ".123456789" - dtype = np.dtype(dtype) - value = np.loadtxt([field], dtype=dtype)[()] - assert value == dtype.type("0.123456789") - - -@pytest.mark.parametrize( - ("given_dtype", "expected_dtype"), - [ - ("S", np.dtype("S5")), - ("U", np.dtype("U5")), - ], -) -def test_loadtxt_string_no_length_given(given_dtype, expected_dtype): - """ - The given dtype is just 'S' or 'U' with no length. In these cases, the - length of the resulting dtype is determined by the longest string found - in the file. - """ - txt = TextIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n") - res = np.loadtxt(txt, dtype=given_dtype, delimiter=",") - expected = np.array( - [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype - ) - assert_equal(res, expected) - assert_equal(res.dtype, expected_dtype) - - -def test_loadtxt_float_conversion(): - """ - Some tests that the conversion to float64 works as accurately as the Python - built-in `float` function. In a naive version of the float parser, these - strings resulted in values that were off by an ULP or two. 
- """ - strings = [ - '0.9999999999999999', - '9876543210.123456', - '5.43215432154321e+300', - '0.901', - '0.333', - ] - txt = TextIO('\n'.join(strings)) - res = np.loadtxt(txt) - expected = np.array([float(s) for s in strings]) - assert_equal(res, expected) - - -def test_loadtxt_bool(): - # Simple test for bool via integer - txt = TextIO("1, 0\n10, -1") - res = np.loadtxt(txt, dtype=bool, delimiter=",") - assert res.dtype == bool - assert_array_equal(res, [[True, False], [True, True]]) - # Make sure we use only 1 and 0 on the byte level: - assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) - - -@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) -def test_loadtxt_integer_signs(dtype): - dtype = np.dtype(dtype) - assert np.loadtxt(["+2"], dtype=dtype) == 2 - if dtype.kind == "u": - with pytest.raises(ValueError): - np.loadtxt(["-1\n"], dtype=dtype) - else: - assert np.loadtxt(["-2\n"], dtype=dtype) == -2 - - for sign in ["++", "+-", "--", "-+"]: - with pytest.raises(ValueError): - np.loadtxt([f"{sign}2\n"], dtype=dtype) - - -@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) -def test_loadtxt_implicit_cast_float_to_int_fails(dtype): - txt = TextIO("1.0, 2.1, 3.7\n4, 5, 6") - with pytest.raises(ValueError): - np.loadtxt(txt, dtype=dtype, delimiter=",") - -@pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) -@pytest.mark.parametrize("with_parens", (False, True)) -def test_loadtxt_complex_parsing(dtype, with_parens): - s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)" - if not with_parens: - s = s.replace("(", "").replace(")", "") - - res = np.loadtxt(TextIO(s), dtype=dtype, delimiter=",") - expected = np.array( - [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 0]], dtype=dtype - ) - assert_equal(res, expected) - - -def test_loadtxt_read_from_generator(): - def gen(): - for i in range(4): - yield f"{i},{2*i},{i**2}" - - res = np.loadtxt(gen(), dtype=int, delimiter=",") - expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]]) 
- assert_equal(res, expected) - - -def test_loadtxt_read_from_generator_multitype(): - def gen(): - for i in range(3): - yield f"{i} {i / 4}" - - res = np.loadtxt(gen(), dtype="i, d", delimiter=" ") - expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d") - assert_equal(res, expected) - - -def test_loadtxt_read_from_bad_generator(): - def gen(): - for entry in ["1,2", b"3, 5", 12738]: - yield entry - - with pytest.raises( - TypeError, match=r"non-string returned while reading data" - ): - np.loadtxt(gen(), dtype="i, i", delimiter=",") - - -@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") -def test_loadtxt_object_cleanup_on_read_error(): - sentinel = object() - - already_read = 0 - def conv(x): - nonlocal already_read - if already_read > 4999: - raise ValueError("failed half-way through!") - already_read += 1 - return sentinel - - txt = TextIO("x\n" * 10000) - - with pytest.raises(ValueError, match="at row 5000, column 1"): - np.loadtxt(txt, dtype=object, converters={0: conv}) - - assert sys.getrefcount(sentinel) == 2 - - -def test_loadtxt_character_not_bytes_compatible(): - """Test exception when a character cannot be encoded as 'S'.""" - data = StringIO("–") # == \u2013 - with pytest.raises(ValueError): - np.loadtxt(data, dtype="S5") - - -@pytest.mark.parametrize("conv", (0, [float], "")) -def test_loadtxt_invalid_converter(conv): - msg = ( - "converters must be a dictionary mapping columns to converter " - "functions or a single callable." 
- ) - with pytest.raises(TypeError, match=msg): - np.loadtxt(TextIO("1 2\n3 4"), converters=conv) - - -def test_loadtxt_converters_dict_raises_non_integer_key(): - with pytest.raises(TypeError, match="keys of the converters dict"): - np.loadtxt(TextIO("1 2\n3 4"), converters={"a": int}) - with pytest.raises(TypeError, match="keys of the converters dict"): - np.loadtxt(TextIO("1 2\n3 4"), converters={"a": int}, usecols=0) - - -@pytest.mark.parametrize("bad_col_ind", (3, -3)) -def test_loadtxt_converters_dict_raises_non_col_key(bad_col_ind): - data = TextIO("1 2\n3 4") - with pytest.raises(ValueError, match="converter specified for column"): - np.loadtxt(data, converters={bad_col_ind: int}) - - -def test_loadtxt_converters_dict_raises_val_not_callable(): - with pytest.raises( - TypeError, match="values of the converters dictionary must be callable" - ): - np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) - - -@pytest.mark.parametrize("q", ('"', "'", "`")) -def test_loadtxt_quoted_field(q): - txt = TextIO( - f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n" - ) - dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)]) - expected = np.array( - [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype - ) - - res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) - assert_array_equal(res, expected) - - -def test_loadtxt_quote_support_default(): - """Support for quoted fields is disabled by default.""" - txt = TextIO('"lat,long", 45, 30\n') - dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)]) - - with pytest.raises(ValueError, match="the number of columns changed"): - np.loadtxt(txt, dtype=dtype, delimiter=",") - - # Enable quoting support with non-None value for quotechar param - txt.seek(0) - expected = np.array([("lat,long", 45., 30.)], dtype=dtype) - - res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') - assert_array_equal(res, expected) - - -def test_loadtxt_quotechar_multichar_error(): - txt = 
StringIO("1,2\n3,4") - msg = r".*must be a single unicode character or None" - with pytest.raises(TypeError, match=msg): - np.loadtxt(txt, delimiter=",", quotechar="''") - - -def test_loadtxt_comment_multichar_error_with_quote(): - txt = StringIO("1,2\n3,4") - msg = ( - "when multiple comments or a multi-character comment is given, " - "quotes are not supported." - ) - with pytest.raises(ValueError, match=msg): - np.loadtxt(txt, delimiter=",", comments="123", quotechar='"') - with pytest.raises(ValueError, match=msg): - np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"') - - # A single character string in a tuple is unpacked though: - res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'") - assert_equal(res, [[1, 2], [3, 4]]) - - -def test_loadtxt_structured_dtype_with_quotes(): - data = TextIO( - ( - "1000;2.4;'alpha';-34\n" - "2000;3.1;'beta';29\n" - "3500;9.9;'gamma';120\n" - "4090;8.1;'delta';0\n" - "5001;4.4;'epsilon';-99\n" - "6543;7.8;'omega';-1\n" - ) - ) - dtype = np.dtype( - [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] - ) - expected = np.array( - [ - (1000, 2.4, "alpha", -34), - (2000, 3.1, "beta", 29), - (3500, 9.9, "gamma", 120), - (4090, 8.1, "delta", 0), - (5001, 4.4, "epsilon", -99), - (6543, 7.8, "omega", -1) - ], - dtype=dtype - ) - res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'") - assert_array_equal(res, expected) - - -def test_loadtxt_quoted_field_is_not_empty(): - txt = StringIO('1\n\n"4"\n""') - expected = np.array(["1", "4", ""], dtype="U1") - res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') - assert_equal(res, expected) - - -def test_loadtxt_consecutive_quotechar_escaped(): - txt = TextIO('"Hello, my name is ""Monty""!"') - expected = np.array('Hello, my name is "Monty"!', dtype="U40") - res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"') - assert_equal(res, expected) - - -@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n")) 
-@pytest.mark.parametrize("ndmin", (0, 1, 2)) -@pytest.mark.parametrize("usecols", [None, (1, 2, 3)]) -def test_loadtxt_warn_on_no_data(data, ndmin, usecols): - """Check that a UserWarning is emitted when no data is read from input.""" - if usecols is not None: - expected_shape = (0, 3) - elif ndmin == 2: - expected_shape = (0, 1) # guess a single column?! - else: - expected_shape = (0,) - - txt = TextIO(data) - with pytest.warns(UserWarning, match="input contained no data"): - res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) - assert res.shape == expected_shape - - with NamedTemporaryFile(mode="w") as fh: - fh.write(data) - fh.seek(0) - with pytest.warns(UserWarning, match="input contained no data"): - res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) - assert res.shape == expected_shape - -@pytest.mark.parametrize("skiprows", (2, 3)) -def test_loadtxt_warn_on_skipped_data(skiprows): - data = "1 2 3\n4 5 6" - txt = TextIO(data) - with pytest.warns(UserWarning, match="input contained no data"): - np.loadtxt(txt, skiprows=skiprows) - -@pytest.mark.parametrize("dtype", - list(np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + ["U2"]) -@pytest.mark.parametrize("swap", [True, False]) -def test_loadtxt_byteswapping_and_unaligned(dtype, swap): - data = ["x,1\n"] # no need for complicated data - dtype = np.dtype(dtype) - if swap: - dtype = dtype.newbyteorder() - full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False) - # The above ensures that the interesting "b" field is unaligned: - assert full_dt.fields["b"][1] == 1 - res = np.loadtxt(data, dtype=full_dt, delimiter=",") - assert res["b"] == dtype.type(1) - -@pytest.mark.parametrize("dtype", - np.typecodes["AllInteger"] + "efdFD" + "?") -def test_loadtxt_unicode_whitespace_stripping(dtype): - # Test that all numeric types (and bool) strip whitespace correctly - # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted. 
- # Currently, skip float128 as it did not always support this and has no - # "custom" parsing: - txt = StringIO(' 3 ,"\u202F2\n"') - res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') - assert_array_equal(res, np.array([3, 2]).astype(dtype)) - -@pytest.mark.parametrize("dtype", "FD") -def test_loadtxt_unicode_whitespace_stripping_complex(dtype): - # Complex has a few extra cases since it has two components and parentheses - line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n" - data = [line, line.replace(" ", "\u202F")] - res = np.loadtxt(data, dtype=dtype, delimiter=',') - assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2)) - -@pytest.mark.parametrize("dtype", "FD") -@pytest.mark.parametrize("field", - ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"]) -def test_loadtxt_bad_complex(dtype, field): - with pytest.raises(ValueError): - np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") - - -@pytest.mark.parametrize("data", [ - ["1,2\n", "2\n,3\n"], - ["1,2\n", "2\r,3\n"]]) -def test_loadtxt_bad_newline_in_iterator(data): - # In NumPy <=1.22 this was accepted, because newlines were completely - # ignored when the input was an iterable. This could be changed, but right - # now, we raise an error. - with pytest.raises(ValueError, - match="Found an unquoted embedded newline within a single line"): - np.loadtxt(data, delimiter=",") - -@pytest.mark.parametrize("data", [ - ["1,2\n", "2,3\r\n"], # a universal newline - ["1,2\n", "'2\n',3\n"], # a quoted newline - ["1,2\n", "'2\r',3\n"], - ["1,2\n", "'2\r\n',3\n"], -]) -def test_loadtxt_good_newline_in_iterator(data): - # The quoted newlines will be untransformed here, but are just whitespace. 
- res = np.loadtxt(data, delimiter=",", quotechar="'") - assert_array_equal(res, [[1., 2.], [2., 3.]]) - - -@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"]) -def test_unviersal_newlines_quoted(newline): - # Check that universal newline support within the tokenizer is not applied - # to quoted fields. (note that lines must end in newline or quoted - # fields will not include a newline at all) - data = ['1,"2\n"\n', '3,"4\n', '1"\n'] - data = [row.replace("\n", newline) for row in data] - res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"') - assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']]) - - -def test_loadtxt_iterator_fails_getting_next_line(): - class BadSequence: - def __len__(self): - return 100 - - def __getitem__(self, item): - if item == 50: - raise RuntimeError("Bad things happened!") - return f"{item}, {item+1}" - - with pytest.raises(RuntimeError, match="Bad things happened!"): - np.loadtxt(BadSequence(), dtype=int, delimiter=",") - - -class TestCReaderUnitTests: - # These are internal tests for path that should not be possible to hit - # unless things go very very wrong somewhere. - def test_not_an_filelike(self): - with pytest.raises(AttributeError, match=".*read"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype=np.dtype("i"), filelike=True) - - def test_filelike_read_fails(self): - # Can only be reached if loadtxt opens the file, so it is hard to do - # via the public interface (although maybe not impossible considering - # the current "DataClass" backing). 
- class BadFileLike: - counter = 0 - def read(self, size): - self.counter += 1 - if self.counter > 20: - raise RuntimeError("Bad bad bad!") - return "1,2,3\n" - - with pytest.raises(RuntimeError, match="Bad bad bad!"): - np.core._multiarray_umath._load_from_filelike( - BadFileLike(), dtype=np.dtype("i"), filelike=True) - - def test_filelike_bad_read(self): - # Can only be reached if loadtxt opens the file, so it is hard to do - # via the public interface (although maybe not impossible considering - # the current "DataClass" backing). - class BadFileLike: - counter = 0 - def read(self, size): - return 1234 # not a string! - - with pytest.raises(TypeError, - match="non-string returned while reading data"): - np.core._multiarray_umath._load_from_filelike( - BadFileLike(), dtype=np.dtype("i"), filelike=True) - - def test_not_an_iter(self): - with pytest.raises(TypeError, - match="error reading from object, expected an iterable"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype=np.dtype("i"), filelike=False) - - def test_bad_type(self): - with pytest.raises(TypeError, match="internal error: dtype must"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype="i", filelike=False) - - def test_bad_encoding(self): - with pytest.raises(TypeError, match="encoding must be a unicode"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype=np.dtype("i"), filelike=False, encoding=123) - - @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"]) - def test_manual_universal_newlines(self, newline): - # This is currently not available to users, because we should always - # open files with universal newlines enabled `newlines=None`. - # (And reading from an iterator uses slightly different code paths.) - # We have no real support for `newline="\r"` or `newline="\n" as the - # user cannot specify those options. 
- data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline), - newline="") - - res = np.core._multiarray_umath._load_from_filelike( - data, dtype=np.dtype("U10"), filelike=True, - quote='"', comment="#", skiplines=1) - assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "]) diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py new file mode 100644 index 000000000..b8fd9a796 --- /dev/null +++ b/numpy/lib/tests/test_loadtxt.py @@ -0,0 +1,836 @@ +""" +Tests specific to `np.loadtxt` added during the move of loadtxt to be backed +by C code. +These tests complement those found in `test_io.py`. +""" + +import sys +import pytest +from tempfile import NamedTemporaryFile +from io import StringIO + +import numpy as np +from numpy.ma.testutils import assert_equal +from numpy.testing import assert_array_equal, HAS_REFCOUNT + + +def test_scientific_notation(): + """Test that both 'e' and 'E' are parsed correctly.""" + data = StringIO( + ( + "1.0e-1,2.0E1,3.0\n" + "4.0e-2,5.0E-1,6.0\n" + "7.0e-3,8.0E1,9.0\n" + "0.0e-4,1.0E-1,2.0" + ) + ) + expected = np.array( + [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]] + ) + assert_array_equal(np.loadtxt(data, delimiter=","), expected) + + +@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"]) +def test_comment_multiple_chars(comment): + content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n" + txt = StringIO(content.replace("#", comment)) + a = np.loadtxt(txt, delimiter=",", comments=comment) + assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]]) + + +@pytest.fixture +def mixed_types_structured(): + """ + Fixture providing hetergeneous input data with a structured dtype, along + with the associated structured array. 
+ """ + data = StringIO( + ( + "1000;2.4;alpha;-34\n" + "2000;3.1;beta;29\n" + "3500;9.9;gamma;120\n" + "4090;8.1;delta;0\n" + "5001;4.4;epsilon;-99\n" + "6543;7.8;omega;-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + return data, dtype, expected + + +@pytest.mark.parametrize('skiprows', [0, 1, 2, 3]) +def test_structured_dtype_and_skiprows_no_empty_lines( + skiprows, mixed_types_structured): + data, dtype, expected = mixed_types_structured + a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows) + assert_array_equal(a, expected[skiprows:]) + + +def test_unpack_structured(mixed_types_structured): + data, dtype, expected = mixed_types_structured + + a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True) + assert_array_equal(a, expected["f0"]) + assert_array_equal(b, expected["f1"]) + assert_array_equal(c, expected["f2"]) + assert_array_equal(d, expected["f3"]) + + +def test_structured_dtype_with_shape(): + dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)]) + data = StringIO("0,1,2,3\n6,7,8,9\n") + expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected) + + +def test_structured_dtype_with_multi_shape(): + dtype = np.dtype([("a", "u1", (2, 2))]) + data = StringIO("0 1 2 3\n") + expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype) + assert_array_equal(np.loadtxt(data, dtype=dtype), expected) + + +def test_nested_structured_subarray(): + # Test from gh-16678 + point = np.dtype([('x', float), ('y', float)]) + dt = np.dtype([('code', int), ('points', point, (2,))]) + data = StringIO("100,1,2,3,4\n200,5,6,7,8\n") + expected = np.array( + [ + (100, [(1., 2.), (3., 4.)]), + 
(200, [(5., 6.), (7., 8.)]), + ], + dtype=dt + ) + assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected) + + +def test_structured_dtype_offsets(): + # An aligned structured dtype will have additional padding + dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True) + data = StringIO("1,2,3,4,5,6\n7,8,9,10,11,12\n") + expected = np.array([(1, 2, 3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_negative_row_limits(param): + """skiprows and max_rows should raise for negative parameters.""" + with pytest.raises(ValueError, match="argument must be nonnegative"): + np.loadtxt("foo.bar", **{param: -3}) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_noninteger_row_limits(param): + with pytest.raises(TypeError, match="argument must be an integer"): + np.loadtxt("foo.bar", **{param: 1.0}) + + +@pytest.mark.parametrize( + "data, shape", + [ + ("1 2 3 4 5\n", (1, 5)), # Single row + ("1\n2\n3\n4\n5\n", (5, 1)), # Single column + ] +) +def test_ndmin_single_row_or_col(data, shape): + arr = np.array([1, 2, 3, 4, 5]) + arr2d = arr.reshape(shape) + + assert_array_equal(np.loadtxt(StringIO(data), dtype=int), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=0), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=1), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=2), arr2d) + + +@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"]) +def test_bad_ndmin(badval): + with pytest.raises(ValueError, match="Illegal value of ndmin keyword"): + np.loadtxt("foo.bar", ndmin=badval) + + +@pytest.mark.parametrize( + "ws", + ( + "\t", # tab + "\u2003", # em + "\u00A0", # non-break + "\u3000", # ideographic space + ) +) +def test_blank_lines_spaces_delimit(ws): + txt = StringIO( + f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 
{ws} 90\n # comment\n3 2 1" + ) + # NOTE: It is unclear that the ` # comment` should succeed. Except + # for delimiter=None, which should use any whitespace (and maybe + # should just be implemented closer to Python + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected + ) + + +def test_blank_lines_normal_delimiter(): + txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1') + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected + ) + + +@pytest.mark.parametrize("dtype", (float, object)) +def test_maxrows_no_blank_lines(dtype): + txt = StringIO("1.5,2.5\n3.0,4.0\n5.5,6.0") + res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2) + assert_equal(res.dtype, dtype) + assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype)) + + +@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) +def test_exception_message_bad_values(dtype): + txt = StringIO("1,2\n3,XXX\n5,6") + msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2" + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + +def test_converters_negative_indices(): + txt = StringIO('1.5,2.5\n3.0,XXX\n5.5,6.0') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]]) + res = np.loadtxt( + txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None + ) + assert_equal(res, expected) + + +def test_converters_negative_indices_with_usecols(): + txt = StringIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) + res = np.loadtxt( + txt, + dtype=np.float64, + delimiter=",", + converters=conv, + usecols=[0, -1], + encoding=None, + ) + 
assert_equal(res, expected) + + +def test_ragged_usecols(): + # usecols, and negative ones, work even with varying number of columns. + txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + expected = np.array([[0, 0], [0, 0], [0, 0]]) + res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + assert_equal(res, expected) + + txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n") + with pytest.raises(ValueError, + match="invalid column index -2 at row 1 with 2 columns"): + # There is no -2 column in the second row: + np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + + +def test_empty_usecols(): + txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[]) + assert res.shape == (3,) + assert res.dtype == np.dtype([]) + + +@pytest.mark.parametrize("c1", ["a", "の", "🫕"]) +@pytest.mark.parametrize("c2", ["a", "の", "🫕"]) +def test_large_unicode_characters(c1, c2): + # c1 and c2 span ascii, 16bit and 32bit range. 
+ txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g") + res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",") + expected = np.array( + [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")], + dtype=np.dtype('U12') + ) + assert_equal(res, expected) + + +def test_unicode_with_converter(): + txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n") + conv = {0: lambda s: s.upper()} + res = np.loadtxt( + txt, + dtype=np.dtype("U12"), + converters=conv, + delimiter=",", + encoding=None + ) + expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']]) + assert_equal(res, expected) + + +def test_converter_with_structured_dtype(): + txt = StringIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n') + dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')]) + conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()} + res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv) + expected = np.array( + [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt + ) + assert_equal(res, expected) + + +def test_converter_with_unicode_dtype(): + """ + With the default 'bytes' encoding, tokens are encoded prior to being passed + to the converter. This means that the output of the converter may be bytes + instead of unicode as expected by `read_rows`. + + This test checks that outputs from the above scenario are properly decoded + prior to parsing by `read_rows`. 
+ """ + txt = StringIO('abc,def\nrst,xyz') + conv = bytes.upper + res = np.loadtxt(txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") + expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) + assert_equal(res, expected) + + +def test_read_huge_row(): + row = "1.5, 2.5," * 50000 + row = row[:-1] + "\n" + txt = StringIO(row * 2) + res = np.loadtxt(txt, delimiter=",", dtype=float) + assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) + + +@pytest.mark.parametrize("dtype", "edfgFDG") +def test_huge_float(dtype): + # Covers a non-optimized path that is rarely taken: + field = "0" * 1000 + ".123456789" + dtype = np.dtype(dtype) + value = np.loadtxt([field], dtype=dtype)[()] + assert value == dtype.type("0.123456789") + + +@pytest.mark.parametrize( + ("given_dtype", "expected_dtype"), + [ + ("S", np.dtype("S5")), + ("U", np.dtype("U5")), + ], +) +def test_string_no_length_given(given_dtype, expected_dtype): + """ + The given dtype is just 'S' or 'U' with no length. In these cases, the + length of the resulting dtype is determined by the longest string found + in the file. + """ + txt = StringIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n") + res = np.loadtxt(txt, dtype=given_dtype, delimiter=",") + expected = np.array( + [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype + ) + assert_equal(res, expected) + assert_equal(res.dtype, expected_dtype) + + +def test_float_conversion(): + """ + Some tests that the conversion to float64 works as accurately as the Python + built-in `float` function. In a naive version of the float parser, these + strings resulted in values that were off by an ULP or two. 
+ """ + strings = [ + '0.9999999999999999', + '9876543210.123456', + '5.43215432154321e+300', + '0.901', + '0.333', + ] + txt = StringIO('\n'.join(strings)) + res = np.loadtxt(txt) + expected = np.array([float(s) for s in strings]) + assert_equal(res, expected) + + +def test_bool(): + # Simple test for bool via integer + txt = StringIO("1, 0\n10, -1") + res = np.loadtxt(txt, dtype=bool, delimiter=",") + assert res.dtype == bool + assert_array_equal(res, [[True, False], [True, True]]) + # Make sure we use only 1 and 0 on the byte level: + assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) + + +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_integer_signs(dtype): + dtype = np.dtype(dtype) + assert np.loadtxt(["+2"], dtype=dtype) == 2 + if dtype.kind == "u": + with pytest.raises(ValueError): + np.loadtxt(["-1\n"], dtype=dtype) + else: + assert np.loadtxt(["-2\n"], dtype=dtype) == -2 + + for sign in ["++", "+-", "--", "-+"]: + with pytest.raises(ValueError): + np.loadtxt([f"{sign}2\n"], dtype=dtype) + + +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_implicit_cast_float_to_int_fails(dtype): + txt = StringIO("1.0, 2.1, 3.7\n4, 5, 6") + with pytest.raises(ValueError): + np.loadtxt(txt, dtype=dtype, delimiter=",") + +@pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) +@pytest.mark.parametrize("with_parens", (False, True)) +def test_complex_parsing(dtype, with_parens): + s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)" + if not with_parens: + s = s.replace("(", "").replace(")", "") + + res = np.loadtxt(StringIO(s), dtype=dtype, delimiter=",") + expected = np.array( + [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 0]], dtype=dtype + ) + assert_equal(res, expected) + + +def test_read_from_generator(): + def gen(): + for i in range(4): + yield f"{i},{2*i},{i**2}" + + res = np.loadtxt(gen(), dtype=int, delimiter=",") + expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]]) + assert_equal(res, expected) + 
def test_read_from_generator_multitype():
    # Generator input also works with a structured (multi-field) dtype.
    def gen():
        for i in range(3):
            yield f"{i} {i / 4}"

    res = np.loadtxt(gen(), dtype="i, d", delimiter=" ")
    expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d")
    assert_equal(res, expected)


def test_read_from_bad_generator():
    # Non-string items yielded by the iterable are rejected with a TypeError.
    def gen():
        for entry in ["1,2", b"3, 5", 12738]:
            yield entry

    with pytest.raises(
            TypeError, match=r"non-string returned while reading data"):
        np.loadtxt(gen(), dtype="i, i", delimiter=",")


@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts")
def test_object_cleanup_on_read_error():
    # When a converter raises half-way through, already-created objects must
    # be released (checked via the sentinel's refcount).
    sentinel = object()

    already_read = 0
    def conv(x):
        nonlocal already_read
        if already_read > 4999:
            raise ValueError("failed half-way through!")
        already_read += 1
        return sentinel

    txt = StringIO("x\n" * 10000)

    with pytest.raises(ValueError, match="at row 5000, column 1"):
        np.loadtxt(txt, dtype=object, converters={0: conv})

    # Only `sentinel` itself and the getrefcount argument remain:
    assert sys.getrefcount(sentinel) == 2


def test_character_not_bytes_compatible():
    """Test exception when a character cannot be encoded as 'S'."""
    data = StringIO("–")  # == \u2013
    with pytest.raises(ValueError):
        np.loadtxt(data, dtype="S5")


@pytest.mark.parametrize("conv", (0, [float], ""))
def test_invalid_converter(conv):
    # converters must be a dict or a single callable; anything else raises.
    msg = (
        "converters must be a dictionary mapping columns to converter "
        "functions or a single callable."
    )
    with pytest.raises(TypeError, match=msg):
        np.loadtxt(StringIO("1 2\n3 4"), converters=conv)


def test_converters_dict_raises_non_integer_key():
    # Converter dict keys must be integers (with or without usecols).
    with pytest.raises(TypeError, match="keys of the converters dict"):
        np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int})
    with pytest.raises(TypeError, match="keys of the converters dict"):
        np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}, usecols=0)


@pytest.mark.parametrize("bad_col_ind", (3, -3))
def test_converters_dict_raises_non_col_key(bad_col_ind):
    # Converter keys outside the column range (either sign) are rejected.
    data = StringIO("1 2\n3 4")
    with pytest.raises(ValueError, match="converter specified for column"):
        np.loadtxt(data, converters={bad_col_ind: int})


def test_converters_dict_raises_val_not_callable():
    # Converter dict values must be callable.
    with pytest.raises(
            TypeError,
            match="values of the converters dictionary must be callable"):
        np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1})


@pytest.mark.parametrize("q", ('"', "'", "`"))
def test_quoted_field(q):
    # Delimiters inside a quoted field do not split the field.
    txt = StringIO(
        f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n"
    )
    dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)])
    expected = np.array(
        [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype
    )

    res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q)
    assert_array_equal(res, expected)


def test_quote_support_default():
    """Support for quoted fields is disabled by default."""
    txt = StringIO('"lat,long", 45, 30\n')
    dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)])

    with pytest.raises(ValueError, match="the number of columns changed"):
        np.loadtxt(txt, dtype=dtype, delimiter=",")

    # Enable quoting support with non-None value for quotechar param
    txt.seek(0)
    expected = np.array([("lat,long", 45., 30.)], dtype=dtype)

    res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"')
    assert_array_equal(res, expected)


def test_quotechar_multichar_error():
    # quotechar must be a single character (or None).
    txt = StringIO("1,2\n3,4")
    msg = r".*must be a single unicode character or None"
    with pytest.raises(TypeError, match=msg):
        np.loadtxt(txt, delimiter=",", quotechar="''")


def test_comment_multichar_error_with_quote():
    # Multi-character (or multiple) comments are incompatible with quoting.
    txt = StringIO("1,2\n3,4")
    msg = (
        "when multiple comments or a multi-character comment is given, "
        "quotes are not supported."
    )
    with pytest.raises(ValueError, match=msg):
        np.loadtxt(txt, delimiter=",", comments="123", quotechar='"')
    with pytest.raises(ValueError, match=msg):
        np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"')

    # A single character string in a tuple is unpacked though:
    res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'")
    assert_equal(res, [[1, 2], [3, 4]])


def test_structured_dtype_with_quotes():
    # Quoted string fields combined with a structured dtype.
    data = StringIO(
        (
            "1000;2.4;'alpha';-34\n"
            "2000;3.1;'beta';29\n"
            "3500;9.9;'gamma';120\n"
            "4090;8.1;'delta';0\n"
            "5001;4.4;'epsilon';-99\n"
            "6543;7.8;'omega';-1\n"
        )
    )
    dtype = np.dtype(
        [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)]
    )
    expected = np.array(
        [
            (1000, 2.4, "alpha", -34),
            (2000, 3.1, "beta", 29),
            (3500, 9.9, "gamma", 120),
            (4090, 8.1, "delta", 0),
            (5001, 4.4, "epsilon", -99),
            (6543, 7.8, "omega", -1)
        ],
        dtype=dtype
    )
    res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'")
    assert_array_equal(res, expected)


def test_quoted_field_is_not_empty():
    # An empty quoted field ("") still counts as a (non-skipped) row.
    txt = StringIO('1\n\n"4"\n""')
    expected = np.array(["1", "4", ""], dtype="U1")
    res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"')
    assert_equal(res, expected)


def test_quoted_field_is_not_empty_nonstrict():
    # Same as test_quoted_field_is_not_empty but check that we are not strict
    # about missing closing quote (this is the `csv.reader` default also)
    txt = StringIO('1\n\n"4"\n"')
    expected = np.array(["1", "4", ""], dtype="U1")
    res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"')
    assert_equal(res, expected)


def test_consecutive_quotechar_escaped():
    # A doubled quotechar inside a quoted field is an escaped literal quote.
    txt = StringIO('"Hello, my name is ""Monty""!"')
    expected = np.array('Hello, my name is "Monty"!', dtype="U40")
    res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"')
    assert_equal(res, expected)


@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n"))
@pytest.mark.parametrize("ndmin", (0, 1, 2))
@pytest.mark.parametrize("usecols", [None, (1, 2, 3)])
def test_warn_on_no_data(data, ndmin, usecols):
    """Check that a UserWarning is emitted when no data is read from input."""
    if usecols is not None:
        expected_shape = (0, 3)
    elif ndmin == 2:
        expected_shape = (0, 1)  # guess a single column?!
    else:
        expected_shape = (0,)

    txt = StringIO(data)
    with pytest.warns(UserWarning, match="input contained no data"):
        res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols)
    assert res.shape == expected_shape

    with NamedTemporaryFile(mode="w") as fh:
        fh.write(data)
        fh.seek(0)  # also flushes the buffer so the file is readable below
        with pytest.warns(UserWarning, match="input contained no data"):
            # BUG FIX: read from the temporary file just written, not from
            # the already-exhausted StringIO above (which only warned because
            # an exhausted stream is itself empty).
            res = np.loadtxt(fh.name, ndmin=ndmin, usecols=usecols)
        assert res.shape == expected_shape


@pytest.mark.parametrize("skiprows", (2, 3))
def test_warn_on_skipped_data(skiprows):
    # Skipping all available rows should also warn about missing data.
    data = "1 2 3\n4 5 6"
    txt = StringIO(data)
    with pytest.warns(UserWarning, match="input contained no data"):
        np.loadtxt(txt, skiprows=skiprows)


@pytest.mark.parametrize("dtype",
        list(np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + ["U2"])
@pytest.mark.parametrize("swap", [True, False])
def test_byteswapping_and_unaligned(dtype, swap):
    data = ["x,1\n"]  # no need for complicated data
    dtype = np.dtype(dtype)
    if swap:
        dtype = dtype.newbyteorder()
    full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False)
    # The above ensures that the interesting "b" field is unaligned:
    assert full_dt.fields["b"][1] == 1
    res = np.loadtxt(data, dtype=full_dt, delimiter=",")
    assert res["b"] == dtype.type(1)


@pytest.mark.parametrize("dtype",
        np.typecodes["AllInteger"] + "efdFD" + "?")
def test_unicode_whitespace_stripping(dtype):
    # Test that all numeric types (and bool) strip whitespace correctly
    # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted.
    # Currently, skip float128 as it did not always support this and has no
    # "custom" parsing:
    txt = StringIO(' 3 ,"\u202F2\n"')
    res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"')
    assert_array_equal(res, np.array([3, 2]).astype(dtype))


@pytest.mark.parametrize("dtype", "FD")
def test_unicode_whitespace_stripping_complex(dtype):
    # Complex has a few extra cases since it has two components and parentheses
    line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j )  , 8j , ( 9j ) \n"
    data = [line, line.replace(" ", "\u202F")]
    res = np.loadtxt(data, dtype=dtype, delimiter=',')
    assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2))


@pytest.mark.parametrize("dtype", "FD")
@pytest.mark.parametrize("field",
        ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"])
def test_bad_complex(dtype, field):
    # Malformed complex tokens must be rejected.
    with pytest.raises(ValueError):
        np.loadtxt([field + "\n"], dtype=dtype, delimiter=",")


@pytest.mark.parametrize("dtype",
        np.typecodes["AllInteger"] + "efgdFDG" + "?")
def test_nul_character_error(dtype):
    # Test that a \0 character is correctly recognized as an error even if
    # what comes before is valid (not everything gets parsed internally).
    if dtype.lower() == "g":
        pytest.xfail("longdouble/clongdouble assignment may misbehave.")
    with pytest.raises(ValueError):
        np.loadtxt(["1\000"], dtype=dtype, delimiter=",", quotechar='"')


@pytest.mark.parametrize("dtype",
        np.typecodes["AllInteger"] + "efgdFDG" + "?")
def test_no_thousands_support(dtype):
    # Mainly to document behaviour, Python supports thousands like 1_1.
    # (e and G may end up using different conversion and support it, this is
    # a bug but happens...)
    if dtype == "e":
        pytest.skip("half assignment currently uses Python float converter")
    if dtype in "eG":
        pytest.xfail("clongdouble assignment is buggy (uses `complex` always).")

    assert int("1_1") == float("1_1") == complex("1_1") == 11
    with pytest.raises(ValueError):
        np.loadtxt(["1_1\n"], dtype=dtype)


@pytest.mark.parametrize("data", [
    ["1,2\n", "2\n,3\n"],
    ["1,2\n", "2\r,3\n"]])
def test_bad_newline_in_iterator(data):
    # In NumPy <=1.22 this was accepted, because newlines were completely
    # ignored when the input was an iterable.  This could be changed, but right
    # now, we raise an error.
    with pytest.raises(ValueError,
            match="Found an unquoted embedded newline within a single line"):
        np.loadtxt(data, delimiter=",")


@pytest.mark.parametrize("data", [
    ["1,2\n", "2,3\r\n"],  # a universal newline
    ["1,2\n", "'2\n',3\n"],  # a quoted newline
    ["1,2\n", "'2\r',3\n"],
    ["1,2\n", "'2\r\n',3\n"],
])
def test_good_newline_in_iterator(data):
    # The quoted newlines will be untransformed here, but are just whitespace.
    res = np.loadtxt(data, delimiter=",", quotechar="'")
    assert_array_equal(res, [[1., 2.], [2., 3.]])


@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"])
def test_universal_newlines_quoted(newline):
    # Check that universal newline support within the tokenizer is not applied
    # to quoted fields.  (note that lines must end in newline or quoted
    # fields will not include a newline at all)
    data = ['1,"2\n"\n', '3,"4\n', '1"\n']
    data = [row.replace("\n", newline) for row in data]
    res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"')
    assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']])


def test_null_character():
    # Basic tests to check that the NUL character is not special:
    res = np.loadtxt(["1\0002\0003\n", "4\0005\0006"], delimiter="\000")
    assert_array_equal(res, [[1, 2, 3], [4, 5, 6]])

    # Also not as part of a field (avoid unicode/arrays as unicode strips \0)
    res = np.loadtxt(["1\000,2\000,3\n", "4\000,5\000,6"],
                     delimiter=",", dtype=object)
    assert res.tolist() == [["1\000", "2\000", "3"], ["4\000", "5\000", "6"]]


def test_iterator_fails_getting_next_line():
    # An exception raised by the input sequence must propagate unchanged.
    class BadSequence:
        def __len__(self):
            return 100

        def __getitem__(self, item):
            if item == 50:
                raise RuntimeError("Bad things happened!")
            return f"{item}, {item+1}"

    with pytest.raises(RuntimeError, match="Bad things happened!"):
        np.loadtxt(BadSequence(), dtype=int, delimiter=",")


class TestCReaderUnitTests:
    # These are internal tests for path that should not be possible to hit
    # unless things go very very wrong somewhere.
    def test_not_an_filelike(self):
        with pytest.raises(AttributeError, match=".*read"):
            np.core._multiarray_umath._load_from_filelike(
                object(), dtype=np.dtype("i"), filelike=True)

    def test_filelike_read_fails(self):
        # Can only be reached if loadtxt opens the file, so it is hard to do
        # via the public interface (although maybe not impossible considering
        # the current "DataClass" backing).
        class BadFileLike:
            counter = 0

            def read(self, size):
                self.counter += 1
                if self.counter > 20:
                    raise RuntimeError("Bad bad bad!")
                return "1,2,3\n"

        with pytest.raises(RuntimeError, match="Bad bad bad!"):
            np.core._multiarray_umath._load_from_filelike(
                BadFileLike(), dtype=np.dtype("i"), filelike=True)

    def test_filelike_bad_read(self):
        # Can only be reached if loadtxt opens the file, so it is hard to do
        # via the public interface (although maybe not impossible considering
        # the current "DataClass" backing).
        class BadFileLike:
            counter = 0

            def read(self, size):
                return 1234  # not a string!

        with pytest.raises(TypeError,
                match="non-string returned while reading data"):
            np.core._multiarray_umath._load_from_filelike(
                BadFileLike(), dtype=np.dtype("i"), filelike=True)

    def test_not_an_iter(self):
        with pytest.raises(TypeError,
                match="error reading from object, expected an iterable"):
            np.core._multiarray_umath._load_from_filelike(
                object(), dtype=np.dtype("i"), filelike=False)

    def test_bad_type(self):
        with pytest.raises(TypeError, match="internal error: dtype must"):
            np.core._multiarray_umath._load_from_filelike(
                object(), dtype="i", filelike=False)

    def test_bad_encoding(self):
        with pytest.raises(TypeError, match="encoding must be a unicode"):
            np.core._multiarray_umath._load_from_filelike(
                object(), dtype=np.dtype("i"), filelike=False, encoding=123)

    @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"])
    def test_manual_universal_newlines(self, newline):
        # This is currently not available to users, because we should always
        # open files with universal newlines enabled `newlines=None`.
        # (And reading from an iterator uses slightly different code paths.)
        # We have no real support for `newline="\r"` or `newline="\n" as the
        # user cannot specify those options.
        data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline),
                        newline="")

        res = np.core._multiarray_umath._load_from_filelike(
            data, dtype=np.dtype("U10"), filelike=True,
            quote='"', comment="#", skiplines=1)
        assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "])