summary refs log tree commit diff
diff options
context:
space:
mode:
author Ezio Melotti <ezio.melotti@gmail.com> 2012-02-15 12:44:23 +0200
committer Ezio Melotti <ezio.melotti@gmail.com> 2012-02-15 12:44:23 +0200
commit d2307cb48ab09baa846947c5c2c4001dce9b6e52 (patch)
tree 5667706edf910500c90d351f4114e2b97d1c76df
parent fd7e4964bbe8dcd750c46aa2a96aeaec97e7ef25 (diff)
download cpython-git-d2307cb48ab09baa846947c5c2c4001dce9b6e52.tar.gz
#13987: HTMLParser is now able to handle EOFs in the middle of a construct.
-rw-r--r-- Lib/HTMLParser.py 13
-rw-r--r-- Lib/test/test_htmlparser.py 16
-rw-r--r-- Misc/NEWS 3
3 files changed, 21 insertions, 11 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index f230c5f163..d2268d02cd 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -170,9 +170,16 @@ class HTMLParser(markupbase.ParserBase):
else:
break
if k < 0:
- if end:
- self.error("EOF in middle of construct")
- break
+ if not end:
+ break
+ k = rawdata.find('>', i + 1)
+ if k < 0:
+ k = rawdata.find('<', i + 1)
+ if k < 0:
+ k = i + 1
+ else:
+ k += 1
+ self.handle_data(rawdata[i:k])
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 6667512785..ba775abdac 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -204,16 +204,16 @@ text
def test_starttag_junk_chars(self):
self._run_check("</>", [])
self._run_check("</$>", [('comment', '$')])
- self._parse_error("</")
- self._parse_error("</a")
+ self._run_check("</", [('data', '</')])
+ self._run_check("</a", [('data', '</a')])
self._parse_error("<a<a>")
self._run_check("</a<a>", [('endtag', 'a<a')])
- self._parse_error("<!")
- self._parse_error("<a")
- self._parse_error("<a foo='bar'")
- self._parse_error("<a foo='bar")
- self._parse_error("<a foo='>'")
- self._parse_error("<a foo='>")
+ self._run_check("<!", [('data', '<!')])
+ self._run_check("<a", [('data', '<a')])
+ self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
+ self._run_check("<a foo='bar", [('data', "<a foo='bar")])
+ self._run_check("<a foo='>'", [('data', "<a foo='>'")])
+ self._run_check("<a foo='>", [('data', "<a foo='>")])
def test_valid_doctypes(self):
# from http://www.w3.org/QA/2002/04/valid-dtd-list.html
diff --git a/Misc/NEWS b/Misc/NEWS
index ba09480640..825752134a 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -93,6 +93,9 @@ Core and Builtins
Library
-------
+- Issue #13987: HTMLParser is now able to handle EOFs in the middle of a
+ construct.
+
- Issue #13015: Fix a possible reference leak in defaultdict.__repr__.
Patch by Suman Saha.