summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: jab <jab@math.brown.edu> 2014-09-04 11:20:04 -0400
committer: jab <jab@math.brown.edu> 2014-09-04 13:39:08 -0400
commit: fef78cdafefb61ed9114a4697b8a0630119ee6d7 (patch)
tree: 8cd5f5f65abf2d25b953be278fb9938c854abf97 /src
parent: 10572e15b3d7b85fe807e249ec4e5808ef5733e6 (diff)
download: python-lxml-fef78cdafefb61ed9114a4697b8a0630119ee6d7.tar.gz
lxml.html.document_fromstring ensure_head_body
When using lxml.html.document_fromstring to process HTML outside your control, you can't be sure it will have a head element or a body element. Allowing document_fromstring to accept an ensure_head_body option saves you from having to write code like:

    doc = document_fromstring(html)
    try:
        doc.head
    except IndexError:
        doc.insert(0, Element('head'))
    # now we can safely reference doc.head

You can instead just write:

    doc = document_fromstring(html, ensure_head_body=True)
Diffstat (limited to 'src')
-rw-r--r--  src/lxml/html/__init__.py  6
-rw-r--r--  src/lxml/html/tests/test_basic.txt  17
2 files changed, 22 insertions, 1 deletions
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 556ab3e1..a3ad82a7 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -607,13 +607,17 @@ _looks_like_full_html_unicode = re.compile(
_looks_like_full_html_bytes = re.compile(
r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
-def document_fromstring(html, parser=None, **kw):
+def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
if parser is None:
parser = html_parser
value = etree.fromstring(html, parser, **kw)
if value is None:
raise etree.ParserError(
"Document is empty")
+ if ensure_head_body and value.find('head') is None:
+ value.insert(0, Element('head'))
+ if ensure_head_body and value.find('body') is None:
+ value.append(Element('body'))
return value
def fragments_fromstring(html, no_leading_text=False, base_url=None,
diff --git a/src/lxml/html/tests/test_basic.txt b/src/lxml/html/tests/test_basic.txt
index d7066402..ddbb0bf5 100644
--- a/src/lxml/html/tests/test_basic.txt
+++ b/src/lxml/html/tests/test_basic.txt
@@ -160,3 +160,20 @@ Bug 690319: Leading whitespace before doctype declaration should not raise an er
>>> print(tostring(etree_document, encoding=unicode))
<html></html>
+Feature https://github.com/lxml/lxml/pull/140: ensure_head_body option:
+
+ >>> from lxml.html import document_fromstring, tostring
+ >>> from functools import partial
+ >>> tos = partial(tostring, encoding=unicode)
+ >>> print(tos(document_fromstring('<p>test</p>')))
+ <html><body><p>test</p></body></html>
+ >>> print(tos(document_fromstring('<p>test</p>', ensure_head_body=True)))
+ <html><head></head><body><p>test</p></body></html>
+ >>> print(tos(document_fromstring('<meta>')))
+ <html><head><meta></head></html>
+ >>> print(tos(document_fromstring('<meta>', ensure_head_body=True)))
+ <html><head><meta></head><body></body></html>
+ >>> print(tos(document_fromstring('<html></html>')))
+ <html></html>
+ >>> print(tos(document_fromstring('<html></html>', ensure_head_body=True)))
+ <html><head></head><body></body></html>