diff options
author | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-01 14:09:56 +0200 |
---|---|---|
committer | Ezio Melotti <ezio.melotti@gmail.com> | 2011-11-01 14:09:56 +0200 |
commit | 7e82b276dd5c1f786e7bd3c1554ac2017a909ab9 (patch) | |
tree | 93fdd9d701f045ebd6afc9e3b746b2fb1eec1ae9 /Lib/HTMLParser.py | |
parent | 1f3b84f9713c8752ffdd2cb6141a731128fe0e14 (diff) | |
download | cpython-git-7e82b276dd5c1f786e7bd3c1554ac2017a909ab9.tar.gz |
#670664: Fix HTMLParser to correctly handle the content of ``<script>...</script>`` and ``<style>...</style>``.
Diffstat (limited to 'Lib/HTMLParser.py')
-rw-r--r-- | Lib/HTMLParser.py | 22 |
1 file changed, 18 insertions, 4 deletions
```diff
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 884d2a53c5..94ebc7f8dc 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -43,6 +43,8 @@ locatestarttagend = re.compile(r"""
   \s* # trailing whitespace
 """, re.VERBOSE)
 endendtag = re.compile('>')
+# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
+# </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
 
@@ -96,6 +98,7 @@ class HTMLParser(markupbase.ParserBase):
         self.rawdata = ''
         self.lasttag = '???'
         self.interesting = interesting_normal
+        self.cdata_elem = None
         markupbase.ParserBase.reset(self)
 
     def feed(self, data):
@@ -120,11 +123,13 @@ class HTMLParser(markupbase.ParserBase):
         """Return full source of start tag: '<...>'."""
         return self.__starttag_text
 
-    def set_cdata_mode(self):
+    def set_cdata_mode(self, elem):
         self.interesting = interesting_cdata
+        self.cdata_elem = elem.lower()
 
     def clear_cdata_mode(self):
         self.interesting = interesting_normal
+        self.cdata_elem = None
 
     # Internal -- handle data as far as reasonable. May leave state
     # and data to be processed by a subsequent call. If 'end' is
@@ -270,7 +275,7 @@ class HTMLParser(markupbase.ParserBase):
         else:
             self.handle_starttag(tag, attrs)
             if tag in self.CDATA_CONTENT_ELEMENTS:
-                self.set_cdata_mode()
+                self.set_cdata_mode(tag)
         return endpos
 
     # Internal -- check to see if we have a complete starttag; return end
@@ -314,9 +319,18 @@ class HTMLParser(markupbase.ParserBase):
         j = match.end()
         match = endtagfind.match(rawdata, i) # </ + tag + >
         if not match:
+            if self.cdata_elem is not None:
+                self.handle_data(rawdata[i:j])
+                return j
             self.error("bad end tag: %r" % (rawdata[i:j],))
-        tag = match.group(1)
-        self.handle_endtag(tag.lower())
+
+        elem = match.group(1).lower() # script or style
+        if self.cdata_elem is not None:
+            if elem != self.cdata_elem:
+                self.handle_data(rawdata[i:j])
+                return j
+
+        self.handle_endtag(elem)
         self.clear_cdata_mode()
         return j
 
```