author    | Ned Batchelder <ned@nedbatchelder.com> | 2009-11-15 11:12:12 -0500
committer | Ned Batchelder <ned@nedbatchelder.com> | 2009-11-15 11:12:12 -0500
commit    | ed8c7ca1c13188bbeed40bb132fa06690e510b34 (patch)
tree      | f78a69636ad8081d125eb2c26bdb06ce65fc671e /coverage/phystokens.py
parent    | a4c6cbf897c62cd708195aaed924ec05de4ebb7b (diff)
download  | python-coveragepy-ed8c7ca1c13188bbeed40bb132fa06690e510b34.tar.gz
Fix a problem with syntax coloring of continued lines, refactor for testability, and add tests. Fixes issue #30.
Diffstat (limited to 'coverage/phystokens.py')
-rw-r--r-- | coverage/phystokens.py | 101
1 file changed, 101 insertions, 0 deletions
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
new file mode 100644
index 0000000..7eebb8a
--- /dev/null
+++ b/coverage/phystokens.py
@@ -0,0 +1,101 @@
+"""Better tokenizing for coverage.py."""
+
+import keyword, re, token, tokenize
+from coverage.backward import StringIO # pylint: disable-msg=W0622
+
+def phys_tokens(toks):
+    """Return all physical tokens, even line continuations.
+
+    tokenize.generate_tokens() doesn't return a token for the backslash that
+    continues lines. This wrapper provides those tokens so that we can
+    re-create a faithful representation of the original source.
+
+    Returns the same values as generate_tokens()
+
+    """
+    last_line = None
+    last_lineno = -1
+    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
+        if last_lineno != elineno:
+            if last_line and last_line[-2:] == "\\\n":
+                # We are at the beginning of a new line, and the last line
+                # ended with a backslash. We probably have to inject a
+                # backslash token into the stream. Unfortunately, there's more
+                # to figure out. This code::
+                #
+                #   usage = """\
+                #       HEY THERE
+                #       """
+                #
+                # triggers this condition, but the token text is::
+                #
+                #   '"""\\\nHEY THERE\n"""'
+                #
+                # so we need to figure out if the backslash is already in the
+                # string token or not.
+                inject_backslash = True
+                if ttype == token.STRING:
+                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
+                        # It's a multiline string and the first line ends with
+                        # a backslash, so we don't need to inject another.
+                        inject_backslash = False
+                if inject_backslash:
+                    # Figure out what column the backslash is in.
+                    ccol = len(last_line.split("\n")[-2]) - 1
+                    # Yield the token, with a fake token type.
+                    yield (
+                        99999, "\\\n",
+                        (slineno, ccol), (slineno, ccol+2),
+                        last_line
+                        )
+            last_line = ltext
+        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
+        last_lineno = elineno
+
+
+def source_token_lines(source):
+    """Generate a series of lines, one for each line in `source`.
+
+    Each line is a list of pairs, each pair is a token::
+
+        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]
+
+    Each pair has a token class, and the token text.
+
+    If you concatenate all the token texts, and then join them with newlines,
+    you should have your original `source` back, with two differences:
+    trailing whitespace is not preserved, and a final line with no newline
+    is indistinguishable from a final line with a newline.
+
+    """
+    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
+    line = []
+    col = 0
+    tokgen = tokenize.generate_tokens(StringIO(source).readline)
+    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
+        mark_start = True
+        for part in re.split('(\n)', ttext):
+            if part == '\n':
+                yield line
+                line = []
+                col = 0
+                mark_end = False
+            elif part == '':
+                mark_end = False
+            elif ttype in ws_tokens:
+                mark_end = False
+            else:
+                if mark_start and scol > col:
+                    line.append(("ws", " " * (scol - col)))
+                    mark_start = False
+                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
+                if ttype == token.NAME and keyword.iskeyword(ttext):
+                    tok_class = "key"
+                line.append((tok_class, part))
+                mark_end = True
+            scol = 0
+        if mark_end:
+            col = ecol
+
+    if line:
+        yield line
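As a sanity check on the backslash-injection behavior described in `phys_tokens`'s docstring, here is a minimal, hypothetical usage sketch. It assumes this version of the module is importable as `coverage.phystokens`, and substitutes `io.StringIO` for `coverage.backward.StringIO`; the fake token type 99999 comes from the code above.

```python
import tokenize
from io import StringIO  # stand-in for coverage.backward.StringIO

from coverage.phystokens import phys_tokens

source = "x = 1 + \\\n    2\n"

# Plain generate_tokens() never reports the trailing backslash...
plain = list(tokenize.generate_tokens(StringIO(source).readline))
assert not any(tok[1] == "\\\n" for tok in plain)

# ...but phys_tokens() injects a fake token (type 99999) for it, so a
# faithful physical rendering of the source can be rebuilt token by token.
phys = list(phys_tokens(tokenize.generate_tokens(StringIO(source).readline)))
assert any(tok[0] == 99999 and tok[1] == "\\\n" for tok in phys)

# The multiline-string case from the comment above needs no injection:
# the backslash is already part of the STRING token's text.
tricky = 'usage = """\\\nHEY THERE\n"""\n'
phys = list(phys_tokens(tokenize.generate_tokens(StringIO(tricky).readline)))
assert not any(tok[0] == 99999 for tok in phys)
```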
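And a sketch of driving `source_token_lines` itself, under the same assumptions; the commented output follows from the docstring's description of the token classes ('key', 'nam', 'op', 'str', 'ws') and of the round-trip property.

```python
from coverage.phystokens import source_token_lines

source = "def hello():\n    return 'hi'\n"

for line in source_token_lines(source):
    print(line)

# Expected, per the docstring's format:
#   [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')]
#   [('ws', '    '), ('key', 'return'), ('ws', ' '), ('str', "'hi'")]

# Concatenating the token texts and joining with newlines should
# round-trip the source, modulo the final newline.
rebuilt = "\n".join(
    "".join(text for _, text in line) for line in source_token_lines(source)
)
assert rebuilt == source.rstrip("\n")
```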