diff options
author | Ned Batchelder <ned@nedbatchelder.com> | 2014-12-28 08:45:17 -0500 |
---|---|---|
committer | Ned Batchelder <ned@nedbatchelder.com> | 2014-12-28 08:45:17 -0500 |
commit | 228c5a07e04eda70074ce40b25512700f5168dc4 (patch) | |
tree | 2c308e7b249fc42ed7118abb364ef643d828cf2f | |
parent | e748d2cbe359876130fbd9477ecbbb320b9df75b (diff) | |
download | python-coveragepy-git-228c5a07e04eda70074ce40b25512700f5168dc4.tar.gz |
Make source_encoding stricter about its arguments, and test it everywhere
-rw-r--r-- | coverage/phystokens.py | 19 | ||||
-rw-r--r-- | tests/test_phystokens.py | 109 |
2 files changed, 76 insertions, 52 deletions
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
index bf55e8a3..95776251 100644
--- a/coverage/phystokens.py
+++ b/coverage/phystokens.py
@@ -148,11 +148,17 @@ generate_tokens = CachedTokenizer().generate_tokens
 
 
 def _source_encoding_py2(source):
-    """Determine the encoding for `source` (a string), according to PEP 263.
+    """Determine the encoding for `source`, according to PEP 263.
 
-    Returns a string, the name of the encoding.
+    Arguments:
+        source (byte string): the text of the program.
+
+    Returns:
+        string: the name of the encoding.
 
     """
+    assert isinstance(source, bytes)
+
     # Do this so the detect_encode code we copied will work.
     readline = iter(source.splitlines(True)).next
 
@@ -240,11 +246,16 @@ def _source_encoding_py2(source):
 
 
 def _source_encoding_py3(source):
-    """Determine the encoding for `source` (a string), according to PEP 263.
+    """Determine the encoding for `source`, according to PEP 263.
+
+    Arguments:
+        source (byte string): the text of the program.
 
-    Returns a string, the name of the encoding.
+    Returns:
+        string: the name of the encoding.
 
     """
+    assert isinstance(source, bytes)
     readline = iter(source.splitlines(True)).__next__
     return tokenize.detect_encoding(readline)[0]
 
diff --git a/tests/test_phystokens.py b/tests/test_phystokens.py
index 10e0225f..bbd956ea 100644
--- a/tests/test_phystokens.py
+++ b/tests/test_phystokens.py
@@ -1,8 +1,11 @@
 """Tests for Coverage.py's improved tokenizer."""
 
-#from __future__ import unicode_literals
+import os
+import re
+import sys
+
+from nose.plugins.skip import SkipTest
 
-import os, re, sys
 from tests.coveragetest import CoverageTest
 from coverage.phystokens import source_token_lines, source_encoding
 
@@ -83,49 +86,59 @@ class PhysTokensTest(CoverageTest):
         self.check_file_tokenization(stress)
 
 
-# source_encoding is only used on Py2.
-if sys.version_info < (3, 0):
-    class SourceEncodingTest(CoverageTest):
-        """Tests of source_encoding() for detecting encodings on Py2."""
-
-        run_in_temp_dir = False
-
-        def test_detect_source_encoding(self):
-            # Various forms from http://www.python.org/dev/peps/pep-0263/
-            source = "# coding=cp850\n\n"
-            self.assertEqual(source_encoding(source), 'cp850')
-            source = "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n"
-            self.assertEqual(source_encoding(source), 'utf-8')
-            source = "#!/usr/bin/python\n# vim: set fileencoding=utf8 :\n"
-            self.assertEqual(source_encoding(source), 'utf8')
-            source = "# This Python file uses this encoding: utf-8\n"
-            self.assertEqual(source_encoding(source), 'utf-8')
-
-        def test_detect_source_encoding_not_in_comment(self):
-            # Should not detect anything here
-            source = 'def parse(src, encoding=None):\n pass'
-            self.assertEqual(source_encoding(source), 'ascii')
-
-        def test_detect_source_encoding_on_second_line(self):
-            # A coding declaration should be found despite a first blank line.
-            source = "\n# coding=cp850\n\n"
-            self.assertEqual(source_encoding(source), 'cp850')
-
-        def test_dont_detect_source_encoding_on_third_line(self):
-            # A coding declaration doesn't count on the third line.
-            source = "\n\n# coding=cp850\n\n"
-            self.assertEqual(source_encoding(source), 'ascii')
-
-        def test_detect_source_encoding_of_empty_file(self):
-            # An important edge case.
-            self.assertEqual(source_encoding(""), 'ascii')
-
-        def test_bom(self):
-            # A BOM means utf-8.
-            source = "\xEF\xBB\xBFtext = 'hello'\n"
-            self.assertEqual(source_encoding(source), 'utf-8-sig')
-
-            # But it has to be the only authority.
-            source = "\xEF\xBB\xBF# coding: cp850\n"
-            with self.assertRaises(SyntaxError):
-                source_encoding(source)
+# The default encoding is different in Python 2 and Python 3.
+if sys.version_info >= (3, 0):
+    DEF_ENCODING = "utf-8"
+else:
+    DEF_ENCODING = "ascii"
+
+
+class SourceEncodingTest(CoverageTest):
+    """Tests of source_encoding() for detecting encodings."""
+
+    run_in_temp_dir = False
+
+    def test_detect_source_encoding(self):
+        # Various forms from http://www.python.org/dev/peps/pep-0263/
+        source = b"# coding=cp850\n\n"
+        self.assertEqual(source_encoding(source), 'cp850')
+        source = b"#!/usr/bin/python\n# -*- coding: utf-8 -*-\n"
+        self.assertEqual(source_encoding(source), 'utf-8')
+        source = b"#!/usr/bin/python\n# vim: set fileencoding=utf8 :\n"
+        self.assertEqual(source_encoding(source), 'utf8')
+        source = b"# This Python file uses this encoding: utf-8\n"
+        self.assertEqual(source_encoding(source), 'utf-8')
+
+    def test_detect_source_encoding_not_in_comment(self):
+        if '__pypy__' in sys.builtin_module_names:
+            if sys.version_info > (3, 0):
+                # PyPy3 gets this case wrong. Not sure what I can do about it,
+                # so skip the test.
+                raise SkipTest
+        # Should not detect anything here
+        source = b'def parse(src, encoding=None):\n pass'
+        self.assertEqual(source_encoding(source), DEF_ENCODING)
+
+    def test_detect_source_encoding_on_second_line(self):
+        # A coding declaration should be found despite a first blank line.
+        source = b"\n# coding=cp850\n\n"
+        self.assertEqual(source_encoding(source), 'cp850')
+
+    def test_dont_detect_source_encoding_on_third_line(self):
+        # A coding declaration doesn't count on the third line.
+        source = b"\n\n# coding=cp850\n\n"
+        self.assertEqual(source_encoding(source), DEF_ENCODING)
+
+    def test_detect_source_encoding_of_empty_file(self):
+        # An important edge case.
+        self.assertEqual(source_encoding(b""), DEF_ENCODING)
+
+    def test_bom(self):
+        # A BOM means utf-8.
+        source = b"\xEF\xBB\xBFtext = 'hello'\n"
+        self.assertEqual(source_encoding(source), 'utf-8-sig')
+
+        # But it has to be the only authority.
+        source = b"\xEF\xBB\xBF# coding: cp850\n"
+        with self.assertRaises(SyntaxError):
+            source_encoding(source)