summaryrefslogtreecommitdiff
path: root/tests/test_phystokens.py
blob: e15400b63ba7ebf3c9aa386b53513539fa525e9a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""Tests for Coverage.py's improved tokenizer."""

import os, re, sys
from tests.coveragetest import CoverageTest
from coverage.phystokens import source_token_lines, source_encoding


# A tiny three-line program: a comment, a def, and a call with a few
# different token kinds in it.
SIMPLE = (
    "# yay!\n"
    "def foo():\n"
    "  say('two = %d' % 2)\n"
)

# One body line is indented with eight spaces, the next with a single tab.
MIXED_WS = (
    "def hello():\n"
    "        a=\"Hello world!\"\n"
    "\tb=\"indented\"\n"
)

# The directory holding this test module; sibling data files live here too.
HERE = os.path.dirname(__file__)


class PhysTokensTest(CoverageTest):
    """Tests for Coverage.py's improved tokenizer."""

    run_in_temp_dir = False

    def check_tokenization(self, source):
        """Tokenize `source`, then put it back together, should be the same.

        The token texts of every line produced by `source_token_lines` are
        joined, and each line is terminated with a newline.  The result is
        compared with `source` after DOS line endings have been normalized
        and trailing spaces and tabs removed from both.
        """
        tokenized = "".join(
            "".join(text for _, text in line) + "\n"
            for line in source_token_lines(source)
        )
        # source_token_lines doesn't preserve trailing spaces, so trim all that
        # before comparing.
        trailing_ws = r"(?m)[ \t]+$"
        source = source.replace('\r\n', '\n')
        source = re.sub(trailing_ws, "", source)
        tokenized = re.sub(trailing_ws, "", tokenized)
        self.assertMultiLineEqual(source, tokenized)

    def check_file_tokenization(self, fname):
        """Use the contents of `fname` for `check_tokenization`."""
        # Close the file promptly instead of leaving the handle for the
        # garbage collector to clean up.
        with open(fname) as source_file:
            source = source_file.read()
        self.check_tokenization(source)

    def test_simple(self):
        # Check the exact (class, text) pairs for each line of SIMPLE.
        self.assertEqual(list(source_token_lines(SIMPLE)),
            [
                [('com', "# yay!")],
                [('key', 'def'), ('ws', ' '), ('nam', 'foo'), ('op', '('),
                            ('op', ')'), ('op', ':')],
                [('ws', '  '), ('nam', 'say'), ('op', '('),
                            ('str', "'two = %d'"), ('ws', ' '), ('op', '%'),
                            ('ws', ' '), ('num', '2'), ('op', ')')]
            ])
        self.check_tokenization(SIMPLE)

    def test_tab_indentation(self):
        # Mixed tabs and spaces: the tab-indented line is expected to come
        # back with eight spaces of leading whitespace.
        self.assertEqual(list(source_token_lines(MIXED_WS)),
            [
                [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('),
                            ('op', ')'), ('op', ':')],
                [('ws', '        '), ('nam', 'a'), ('op', '='),
                            ('str', '"Hello world!"')],
                [('ws', '        '), ('nam', 'b'), ('op', '='),
                            ('str', '"indented"')],
            ])

    def test_tokenize_real_file(self):
        # Check the tokenization of a real file (large, btw).
        real_file = os.path.join(HERE, "test_coverage.py")
        self.check_file_tokenization(real_file)

    def test_stress(self):
        # Check the tokenization of the stress-test files, including the
        # variant with DOS line endings.
        for stress_name in ("stress_phystoken.tok", "stress_phystoken_dos.tok"):
            self.check_file_tokenization(os.path.join(HERE, stress_name))


# source_encoding is only used on Py2.
if sys.version_info < (3, 0):
    class SourceEncodingTest(CoverageTest):
        """Checks of the encoding that source_encoding() reports on Py2."""

        run_in_temp_dir = False

        def test_detect_source_encoding(self):
            # Various forms from http://www.python.org/dev/peps/pep-0263/
            declarations = [
                ("# coding=cp850\n\n", 'cp850'),
                ("#!/usr/bin/python\n# -*- coding: utf-8 -*-\n", 'utf-8'),
                ("#!/usr/bin/python\n# vim: set fileencoding=utf8 :\n", 'utf8'),
                ("# This Python file uses this encoding: utf-8\n", 'utf-8'),
            ]
            for text, expected in declarations:
                self.assertEqual(source_encoding(text), expected)

        def test_detect_source_encoding_on_second_line(self):
            # A blank first line must not hide a declaration on line two.
            self.assertEqual(source_encoding("\n# coding=cp850\n\n"), 'cp850')

        def test_dont_detect_source_encoding_on_third_line(self):
            # By the third line a declaration is too late to count.
            self.assertEqual(source_encoding("\n\n# coding=cp850\n\n"), 'ascii')

        def test_detect_source_encoding_of_empty_file(self):
            # An important edge case.
            empty = ""
            self.assertEqual(source_encoding(empty), 'ascii')

        def test_bom(self):
            # A BOM on its own means utf-8.
            with_bom = "\xEF\xBB\xBFtext = 'hello'\n"
            self.assertEqual(source_encoding(with_bom), 'utf-8-sig')

            # A BOM combined with a cp850 declaration is rejected.
            conflicting = "\xEF\xBB\xBF# coding: cp850\n"
            with self.assertRaises(SyntaxError):
                source_encoding(conflicting)