diff options
Diffstat (limited to 'coverage/phystokens.py')
-rw-r--r-- | coverage/phystokens.py | 60 |
1 files changed, 44 insertions, 16 deletions
diff --git a/coverage/phystokens.py b/coverage/phystokens.py index ed6bd238..b34b1c3b 100644 --- a/coverage/phystokens.py +++ b/coverage/phystokens.py @@ -1,13 +1,18 @@ +# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 +# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt + """Better tokenizing for coverage.py.""" import codecs import keyword import re +import sys import token import tokenize from coverage import env from coverage.backward import iternext +from coverage.misc import contract def phys_tokens(toks): @@ -66,6 +71,7 @@ def phys_tokens(toks): last_lineno = elineno +@contract(source='unicode') def source_token_lines(source): """Generate a series of lines, one for each line in `source`. @@ -104,7 +110,7 @@ def source_token_lines(source): mark_end = False else: if mark_start and scol > col: - line.append(("ws", " " * (scol - col))) + line.append(("ws", u" " * (scol - col))) mark_start = False tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3] if ttype == token.NAME and keyword.iskeyword(ttext): @@ -134,11 +140,10 @@ class CachedTokenizer(object): self.last_text = None self.last_tokens = None + @contract(text='unicode') def generate_tokens(self, text): """A stand-in for `tokenize.generate_tokens`.""" - # Check the type first so we don't compare bytes to unicode and get - # warnings. - if type(text) != type(self.last_text) or text != self.last_text: + if text != self.last_text: self.last_text = text readline = iternext(text.splitlines(True)) self.last_tokens = list(tokenize.generate_tokens(readline)) @@ -148,14 +153,15 @@ class CachedTokenizer(object): generate_tokens = CachedTokenizer().generate_tokens +COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE) + +@contract(source='bytes') def _source_encoding_py2(source): """Determine the encoding for `source`, according to PEP 263. - Arguments: - source (byte string): the text of the program. + `source` is a byte string, the text of the program. - Returns: - string: the name of the encoding. + Returns a string, the name of the encoding. """ assert isinstance(source, bytes) @@ -165,8 +171,6 @@ def _source_encoding_py2(source): # This is mostly code adapted from Py3.2's tokenize module. - cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)") - def _get_normal_name(orig_enc): """Imitates get_normal_name in tokenizer.c.""" # Only care about the first 12 characters. @@ -204,7 +208,7 @@ def _source_encoding_py2(source): except UnicodeDecodeError: return None - matches = cookie_re.findall(line_string) + matches = COOKIE_RE.findall(line_string) if not matches: return None encoding = _get_normal_name(matches[0]) @@ -246,17 +250,15 @@ def _source_encoding_py2(source): return default +@contract(source='bytes') def _source_encoding_py3(source): """Determine the encoding for `source`, according to PEP 263. - Arguments: - source (byte string): the text of the program. + `source` is a byte string: the text of the program. - Returns: - string: the name of the encoding. + Returns a string, the name of the encoding. """ - assert isinstance(source, bytes) readline = iternext(source.splitlines(True)) return tokenize.detect_encoding(readline)[0] @@ -265,3 +267,29 @@ if env.PY3: source_encoding = _source_encoding_py3 else: source_encoding = _source_encoding_py2 + + +@contract(source='unicode') +def compile_unicode(source, filename, mode): + """Just like the `compile` builtin, but works on any Unicode string. + + Python 2's compile() builtin has a stupid restriction: if the source string + is Unicode, then it may not have a encoding declaration in it. Why not? + Who knows! It also decodes to utf8, and then tries to interpret those utf8 + bytes according to the encoding declaration. Why? Who knows! + + This function neuters the coding declaration, and compiles it. + + """ + source = neuter_encoding_declaration(source) + if env.PY2 and isinstance(filename, unicode): + filename = filename.encode(sys.getfilesystemencoding(), "replace") + code = compile(source, filename, mode) + return code + + +@contract(source='unicode', returns='unicode') +def neuter_encoding_declaration(source): + """Return `source`, with any encoding declaration neutered.""" + source = COOKIE_RE.sub("# (deleted declaration)", source, count=1) + return source |