Diffstat (limited to 'coverage')
-rw-r--r--  coverage/backward.py     75
-rw-r--r--  coverage/html.py         20
-rw-r--r--  coverage/phystokens.py  100
3 files changed, 146 insertions(+), 49 deletions(-)
diff --git a/coverage/backward.py b/coverage/backward.py
index 93cb793a..236bef8e 100644
--- a/coverage/backward.py
+++ b/coverage/backward.py
@@ -6,7 +6,7 @@
# W0611: Unused import blah
# W0622: Redefining built-in blah
-import os, sys
+import os, re, sys
# Python 2.3 doesn't have `set`
try:
@@ -72,30 +72,38 @@ try:
except ImportError:
import ConfigParser as configparser
-# Python 3.2 provides `tokenize.open`, the best way to open source files.
-import tokenize
-try:
- open_source = tokenize.open # pylint: disable=E1101
-except AttributeError:
+# Reading Python source and interpreting the coding comment is a big deal.
+if sys.version_info >= (3, 0):
+ # Python 3.2 provides `tokenize.open`, the best way to open source files.
+ import tokenize
try:
- detect_encoding = tokenize.detect_encoding # pylint: disable=E1101
+ open_source = tokenize.open # pylint: disable=E1101
except AttributeError:
- def open_source(fname):
- """Open a source file the best way."""
- return open(fname, "rU")
- else:
- from io import TextIOWrapper
- # Copied from the 3.2 stdlib:
- def open_source(fname):
- """Open a file in read only mode using the encoding detected by
- detect_encoding().
- """
- buffer = open(fname, 'rb')
- encoding, _ = detect_encoding(buffer.readline)
- buffer.seek(0)
- text = TextIOWrapper(buffer, encoding, line_buffering=True)
- text.mode = 'r'
- return text
+ try:
+ detect_encoding = tokenize.detect_encoding # pylint: disable=E1101
+ except AttributeError:
+            # Shouldn't happen: Python 3 always provides detect_encoding.
+ def open_source(fname):
+ """Open a source file the best way."""
+ return open(fname, "rU")
+ else:
+ from io import TextIOWrapper
+ # Copied from the 3.2 stdlib:
+ def open_source(fname):
+ """Open a file in read only mode using the encoding detected by
+ detect_encoding().
+ """
+ buffer = open(fname, 'rb')
+ encoding, _ = detect_encoding(buffer.readline)
+ buffer.seek(0)
+ text = TextIOWrapper(buffer, encoding, line_buffering=True)
+ text.mode = 'r'
+ return text
+else:
+ def open_source(fname):
+ """Open a source file the best way."""
+ return open(fname, "rU")
+
# Python 3.x is picky about bytes and strings, so provide methods to
# get them right, and make them no-ops in 2.x
@@ -117,27 +125,6 @@ else:
"""Convert bytes `b` to a string (no-op in 2.x)."""
return b
-# A few details about writing encoded text are different in 2.x and 3.x.
-if sys.version_info >= (3, 0):
- def write_encoded(fname, text, encoding='utf8', errors='strict'):
- '''Write string `text` to file names `fname`, with encoding.'''
- # Don't use "with", so that this file is still good for old 2.x.
- f = open(fname, 'w', encoding=encoding, errors=errors)
- try:
- f.write(text)
- finally:
- f.close()
-else:
- # It's not clear that using utf8 strings in 2.x is the right thing to do.
- def write_encoded(fname, text, encoding='utf8', errors='strict'):
- '''Write utf8 string `text` to file names `fname`, with encoding.'''
- import codecs
- f = codecs.open(fname, 'w', encoding=encoding, errors=errors)
- try:
- f.write(text.decode('utf8'))
- finally:
- f.close()
-
# Md5 is available in different places.
try:
import hashlib
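
All of the version branching above stays behind a single helper; the rest of
coverage.py can simply call `open_source`. A minimal sketch of a hypothetical
caller (illustrative only, not part of this commit):

    # Hypothetical caller, for illustration only.
    from coverage.backward import open_source

    def read_source(fname):
        # Decoded via tokenize.open on Python 3.2+, via the copied
        # detect_encoding path on 3.0/3.1, and as "rU" text on Python 2.
        f = open_source(fname)
        try:
            return f.read()
        finally:
            f.close()
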
diff --git a/coverage/html.py b/coverage/html.py
index 00b92587..09683ad8 100644
--- a/coverage/html.py
+++ b/coverage/html.py
@@ -1,11 +1,11 @@
"""HTML reporting for Coverage."""
-import os, re, shutil
+import os, re, shutil, sys
import coverage
-from coverage.backward import pickle, write_encoded
+from coverage.backward import pickle
from coverage.misc import CoverageException, Hasher
-from coverage.phystokens import source_token_lines
+from coverage.phystokens import source_token_lines, source_encoding
from coverage.report import Reporter
from coverage.templite import Templite
@@ -100,7 +100,11 @@ class HtmlReporter(Reporter):
def write_html(self, fname, html):
"""Write `html` to `fname`, properly encoded."""
- write_encoded(fname, html, 'ascii', 'xmlcharrefreplace')
+ fout = open(fname, "wb")
+ try:
+ fout.write(html.encode('ascii', 'xmlcharrefreplace'))
+ finally:
+ fout.close()
def file_hash(self, source, cu):
"""Compute a hash that changes if the file needs to be re-reported."""
@@ -128,6 +132,12 @@ class HtmlReporter(Reporter):
self.status.set_file_hash(flat_rootname, this_hash)
+ # If need be, determine the encoding of the source file. We use it
+ # later to properly write the HTML.
+ if sys.version_info < (3, 0):
+ encoding = source_encoding(source)
+
+ # Get the numbers for this file.
nums = analysis.numbers
missing_branch_arcs = analysis.missing_branch_arcs()
@@ -195,6 +205,8 @@ class HtmlReporter(Reporter):
html_path = os.path.join(self.directory, html_filename)
html = spaceless(self.source_tmpl.render(locals()))
+ if sys.version_info < (3, 0):
+ html = html.decode(encoding)
self.write_html(html_path, html)
# Save this file's information for the index file.
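
The rewritten `write_html` leans on the 'xmlcharrefreplace' error handler: any
character the ascii codec can't encode is emitted as an HTML numeric character
reference, so the report file is plain ASCII bytes no matter what charset the
browser assumes. A quick sketch of that behavior:

    # Sketch: non-ASCII characters become numeric character references.
    html = u"caf\xe9 coverage: 100%"
    encoded = html.encode("ascii", "xmlcharrefreplace")
    assert encoded == b"caf&#233; coverage: 100%"
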
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
index fc4f2c90..850f78bd 100644
--- a/coverage/phystokens.py
+++ b/coverage/phystokens.py
@@ -1,6 +1,6 @@
"""Better tokenizing for coverage.py."""
-import keyword, re, token, tokenize
+import codecs, keyword, re, sys, token, tokenize
from coverage.backward import StringIO # pylint: disable=W0622
def phys_tokens(toks):
@@ -106,3 +106,101 @@ def source_token_lines(source):
if line:
yield line
+
+def source_encoding(source):
+ """Determine the encoding for `source` (a string), according to PEP 263.
+
+ Returns a string, the name of the encoding.
+
+ """
+ # Note: this function should never be called on Python 3, since py3 has
+ # built-in tools to do this.
+ assert sys.version_info < (3, 0)
+
+    # This is mostly code adapted from Python 3.2's tokenize module.
+
+    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
+
+    # Do this so the detect_encoding code we copied will work.
+ readline = iter(source.splitlines()).next
+
+ def _get_normal_name(orig_enc):
+ """Imitates get_normal_name in tokenizer.c."""
+ # Only care about the first 12 characters.
+ enc = orig_enc[:12].lower().replace("_", "-")
+ if re.match(r"^utf-8($|-)", enc):
+ return "utf-8"
+ if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
+ return "iso-8859-1"
+ return orig_enc
+
+    # From detect_encoding():
+ # It detects the encoding from the presence of a utf-8 bom or an encoding
+ # cookie as specified in pep-0263. If both a bom and a cookie are present,
+ # but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+ # 'utf-8-sig' is returned.
+
+ # If no encoding is specified, then the default will be returned. The
+    # default depends on the Python version.
+
+    if sys.version_info < (2, 5):
+ default = 'iso-8859-1'
+ else:
+ default = 'ascii'
+
+ bom_found = False
+ encoding = None
+
+ def read_or_stop():
+ """Get the next source line, or ''."""
+ try:
+ return readline()
+ except StopIteration:
+ return ''
+
+ def find_cookie(line):
+ """Find an encoding cookie in `line`."""
+ try:
+ line_string = line.decode('ascii')
+ except UnicodeDecodeError:
+ return None
+
+ matches = cookie_re.findall(line_string)
+ if not matches:
+ return None
+ encoding = _get_normal_name(matches[0])
+ try:
+ codec = codecs.lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("unknown encoding: " + encoding)
+
+ if bom_found:
+ if codec.name != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError('encoding problem: utf-8')
+ encoding += '-sig'
+ return encoding
+
+ first = read_or_stop()
+ if first.startswith(codecs.BOM_UTF8):
+ bom_found = True
+ first = first[3:]
+ default = 'utf-8-sig'
+ if not first:
+ return default
+
+ encoding = find_cookie(first)
+ if encoding:
+ return encoding
+
+ second = read_or_stop()
+ if not second:
+ return default
+
+ encoding = find_cookie(second)
+ if encoding:
+ return encoding
+
+ return default
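
A usage sketch for the new `source_encoding` (Python 2 only, per the assert at
the top of the function; the source strings are made up for illustration):

    # Illustrative only: pass the file's contents as a byte string.
    import codecs
    from coverage.phystokens import source_encoding

    print(source_encoding("# -*- coding: iso-8859-1 -*-\nx = 1\n"))
    # -> "iso-8859-1"

    print(source_encoding("x = 1\n"))
    # -> "ascii", the default for Python 2.5 and later

    print(source_encoding(codecs.BOM_UTF8 + "x = 1\n"))
    # -> "utf-8-sig": a UTF-8 BOM overrides the default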