| field | value | date |
|---|---|---|
| author | Benjamin Peterson <benjamin@python.org> | 2008-12-12 01:25:05 +0000 |
| committer | Benjamin Peterson <benjamin@python.org> | 2008-12-12 01:25:05 +0000 |
| commit | 433f32c3be3b23adc4ec389ff9e78f49c7288f3d | |
| tree | 569e1c182105ecf44227edfa5bd0169a6ae3e6e8 | |
| parent | e675f08e0333aacfc37f7995ab22e436f2862e2a | |
| download | cpython-git-433f32c3be3b23adc4ec389ff9e78f49c7288f3d.tar.gz | |
raise a SyntaxError in detect_encoding() when a codec lookup fails like the builtin parser #4021
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Lib/test/test_tokenize.py | 2 |
| -rw-r--r-- | Lib/tokenize.py | 33 |
| -rw-r--r-- | Misc/NEWS | 3 |

3 files changed, 25 insertions, 13 deletions
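For context, the "builtin parser" behavior the commit message refers to can be seen by compiling source bytes that carry an unrecognizable coding cookie. A rough sketch (the exact SyntaxError message varies across CPython versions):

```python
# Sketch: the builtin compiler already rejects an unknown coding cookie
# with a SyntaxError; this commit brings tokenize.detect_encoding() in
# line with that behavior.
source = b'# coding: bad\nx = 1\n'
try:
    compile(source, '<test>', 'exec')
except SyntaxError as err:
    print('compile() raised:', err)  # e.g. "unknown encoding: bad"
```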
```diff
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 8fbd216ac6..75a7a809b4 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -795,6 +795,8 @@ class TestDetectEncoding(TestCase):
         self.assertEquals(encoding, 'utf-8')
         self.assertEquals(consumed_lines, [])
+        readline = self.get_readline((b'# coding: bad\n',))
+        self.assertRaises(SyntaxError, detect_encoding, readline)
 
 class TestTokenize(TestCase):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ec5a79a645..16c4f3f029 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
 import re, string, sys
 from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
 from itertools import chain, repeat
 
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
@@ -251,11 +251,11 @@ def detect_encoding(readline):
 
     It detects the encoding from the presence of a utf-8 bom or an encoding
     cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
 
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
-    utf8_bom = b'\xef\xbb\xbf'
     bom_found = False
     encoding = None
     def read_or_stop():
@@ -268,18 +268,25 @@ def detect_encoding(readline):
         try:
             line_string = line.decode('ascii')
         except UnicodeDecodeError:
-            pass
-        else:
-            matches = cookie_re.findall(line_string)
-            if matches:
-                encoding = matches[0]
-                if bom_found and lookup(encoding).name != 'utf-8':
-                    # This behaviour mimics the Python interpreter
-                    raise SyntaxError('encoding problem: utf-8')
-                return encoding
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
 
     first = read_or_stop()
-    if first.startswith(utf8_bom):
+    if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
     if not first:
diff --git a/Misc/NEWS b/Misc/NEWS
@@ -45,6 +45,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #4021: tokenize.detect_encoding() now raises a SyntaxError when the
+  codec cannot be found.  This is for compatibility with the builtin behavior.
+
 - Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
   give correct results in the case where one argument is a quiet NaN
   and the other is a finite number that requires rounding.
```
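After the patch, detect_encoding() surfaces a failed codec lookup as a SyntaxError instead of letting codecs.lookup()'s LookupError escape. A minimal sketch of the resulting behavior (output messages are illustrative; detect_encoding() returns the encoding together with the lines it consumed):

```python
import codecs
import io
from tokenize import detect_encoding

# Unknown codec in the cookie: the patched detect_encoding() raises
# SyntaxError("unknown encoding: bad") rather than a bare LookupError.
bad = io.BytesIO(b'# coding: bad\nprint("hi")\n')
try:
    detect_encoding(bad.readline)
except SyntaxError as err:
    print('detect_encoding() raised:', err)

# A valid cookie is still returned along with the consumed lines.
good = io.BytesIO(b'# -*- coding: utf-8 -*-\nprint("hi")\n')
encoding, consumed = detect_encoding(good.readline)
print(encoding)   # 'utf-8'
print(consumed)   # [b'# -*- coding: utf-8 -*-\n']

# A UTF-8 BOM combined with a non-utf-8 cookie is the other error path,
# unchanged by this commit: SyntaxError('encoding problem: utf-8').
mixed = io.BytesIO(codecs.BOM_UTF8 + b'# coding: latin-1\n')
try:
    detect_encoding(mixed.readline)
except SyntaxError as err:
    print('BOM/cookie mismatch:', err)
```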
