1 files changed, 44 insertions, 16 deletions
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
index ed6bd238..b34b1c3b 100644
--- a/coverage/phystokens.py
+++ b/coverage/phystokens.py
@@ -1,13 +1,18 @@
+# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
+# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
+
 """Better tokenizing for coverage.py."""
 
 import codecs
 import keyword
 import re
+import sys
 import token
 import tokenize
 
 from coverage import env
 from coverage.backward import iternext
+from coverage.misc import contract
 
 
 def phys_tokens(toks):
@@ -66,6 +71,7 @@ def phys_tokens(toks):
         last_lineno = elineno
 
 
+@contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.
 
@@ -104,7 +110,7 @@ def source_token_lines(source):
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", " " * (scol - col)))
+                    line.append(("ws", u" " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                 if ttype == token.NAME and keyword.iskeyword(ttext):
@@ -134,11 +140,10 @@ class CachedTokenizer(object):
         self.last_text = None
         self.last_tokens = None
 
+    @contract(text='unicode')
     def generate_tokens(self, text):
         """A stand-in for `tokenize.generate_tokens`."""
-        # Check the type first so we don't compare bytes to unicode and get
-        # warnings.
-        if type(text) != type(self.last_text) or text != self.last_text:
+        if text != self.last_text:
             self.last_text = text
             readline = iternext(text.splitlines(True))
             self.last_tokens = list(tokenize.generate_tokens(readline))
@@ -148,14 +153,15 @@ class CachedTokenizer(object):
 generate_tokens = CachedTokenizer().generate_tokens
 
 
+COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
+
+@contract(source='bytes')
 def _source_encoding_py2(source):
     """Determine the encoding for `source`, according to PEP 263.
 
-    Arguments:
-        source (byte string): the text of the program.
+    `source` is a byte string, the text of the program.
 
-    Returns:
-        string: the name of the encoding.
+    Returns a string, the name of the encoding.
 
     """
     assert isinstance(source, bytes)
@@ -165,8 +171,6 @@ def _source_encoding_py2(source):
 
     # This is mostly code adapted from Py3.2's tokenize module.
 
-    cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)")
-
     def _get_normal_name(orig_enc):
         """Imitates get_normal_name in tokenizer.c."""
         # Only care about the first 12 characters.
@@ -204,7 +208,7 @@ def _source_encoding_py2(source):
         except UnicodeDecodeError:
             return None
 
-        matches = cookie_re.findall(line_string)
+        matches = COOKIE_RE.findall(line_string)
         if not matches:
             return None
         encoding = _get_normal_name(matches[0])
@@ -246,17 +250,15 @@ def _source_encoding_py2(source):
     return default
 
 
+@contract(source='bytes')
 def _source_encoding_py3(source):
     """Determine the encoding for `source`, according to PEP 263.
 
-    Arguments:
-        source (byte string): the text of the program.
+    `source` is a byte string, the text of the program.
 
-    Returns:
-        string: the name of the encoding.
+    Returns a string, the name of the encoding.
 
     """
-    assert isinstance(source, bytes)
     readline = iternext(source.splitlines(True))
     return tokenize.detect_encoding(readline)[0]
 
@@ -265,3 +267,29 @@ if env.PY3:
     source_encoding = _source_encoding_py3
 else:
     source_encoding = _source_encoding_py2
+
+
+@contract(source='unicode')
+def compile_unicode(source, filename, mode):
+    """Compile `source` like the `compile` builtin, for any Unicode string.
+
+    Python 2's compile() refuses Unicode source that has an encoding
+    declaration in it, so the declaration is neutered before compiling.
+    Under Python 2 a Unicode `filename` is also encoded to bytes with the
+    file system encoding, replacing characters that cannot be encoded.
+
+    Returns the result of compile().
+
+    """
+    neutered = neuter_encoding_declaration(source)
+    needs_bytes_name = env.PY2 and isinstance(filename, unicode)
+    if needs_bytes_name:
+        filename = filename.encode(sys.getfilesystemencoding(), "replace")
+    return compile(neutered, filename, mode)
+
+
+@contract(source='unicode', returns='unicode')
+def neuter_encoding_declaration(source):
+    """Return `source` with any encoding declaration on its first two lines neutered."""
+    top = source.split(u"\n", 2)    # PEP 263 honors only lines 1-2; top[2:] is left untouched.
+    return u"\n".join([COOKIE_RE.sub("# (deleted declaration)", ln) for ln in top[:2]] + top[2:])