diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-14 18:04:05 +0200 |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-14 18:04:05 +0200 |
commit | 0f1571ce7fb7da0e2ad75f941b29f2d19717e012 (patch) | |
tree | 40ead16555f0a9a69bf9524e2271b262e2cbcde6 /Lib/HTMLParser.py | |
parent | 74592919d67706c3554f42da5226afb8b6feb48e (diff) | |
download | cpython-git-0f1571ce7fb7da0e2ad75f941b29f2d19717e012.tar.gz |
#1745761, #755670, #13357, #12629, #1200313: improve attribute handling in HTMLParser.
Diffstat (limited to 'Lib/HTMLParser.py')
-rw-r--r-- | Lib/HTMLParser.py | 20 |
1 file changed, 11 insertions, 9 deletions
```diff
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 94ebc7f8dc..cd353f8ca0 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -24,22 +24,23 @@ starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
+
 attrfind = re.compile(
-    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
+    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
+    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
 
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
-    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
-      (?:\s*=\s*                     # value indicator
+    (?:(?<=['"\s])[^\s/>][^\s/=>]*  # attribute name
+      (?:\s*=+\s*                    # value indicator
         (?:'[^']*'                   # LITA-enclosed value
-          |\"[^\"]*\"                # LIT-enclosed value
-          |[^'\">\s]+                # bare value
+          |"[^"]*"                   # LIT-enclosed value
+          |(?!['"])[^>\s]*           # bare value
          )
-       )?
-     )
-   )*
+       )?\s*
+     )*
+   )?
   \s*                                # trailing whitespace
 """, re.VERBOSE)
 endendtag = re.compile('>')
@@ -254,6 +255,7 @@ class HTMLParser(markupbase.ParserBase):
             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
+            if attrvalue:
                 attrvalue = self.unescape(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
```