diff options
| author | jab <jab@math.brown.edu> | 2014-09-04 11:20:04 -0400 |
|---|---|---|
| committer | jab <jab@math.brown.edu> | 2014-09-04 13:39:08 -0400 |
| commit | fef78cdafefb61ed9114a4697b8a0630119ee6d7 (patch) | |
| tree | 8cd5f5f65abf2d25b953be278fb9938c854abf97 /src | |
| parent | 10572e15b3d7b85fe807e249ec4e5808ef5733e6 (diff) | |
| download | python-lxml-fef78cdafefb61ed9114a4697b8a0630119ee6d7.tar.gz | |
lxml.html.document_fromstring ensure_head_body
When using lxml.html.document_fromstring to process HTML outside your control,
you can't be sure it will have a head element or a body element. Allowing
document_fromstring to accept an ensure_head_body option saves you from having
to write code like:
    doc = document_fromstring(html)
    try:
        doc.head
    except IndexError:
        doc.insert(0, Element('head'))
    # now we can safely reference doc.head
You can instead just write:
    doc = document_fromstring(html, ensure_head_body=True)
Diffstat (limited to 'src')
| -rw-r--r-- | src/lxml/html/__init__.py | 6 | ||||
| -rw-r--r-- | src/lxml/html/tests/test_basic.txt | 17 |
2 files changed, 22 insertions, 1 deletions
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 556ab3e1..a3ad82a7 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -607,13 +607,17 @@ _looks_like_full_html_unicode = re.compile(
 _looks_like_full_html_bytes = re.compile(
     r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
 
-def document_fromstring(html, parser=None, **kw):
+def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
     if parser is None:
         parser = html_parser
     value = etree.fromstring(html, parser, **kw)
     if value is None:
         raise etree.ParserError(
             "Document is empty")
+    if ensure_head_body and value.find('head') is None:
+        value.insert(0, Element('head'))
+    if ensure_head_body and value.find('body') is None:
+        value.append(Element('body'))
     return value
 
 def fragments_fromstring(html, no_leading_text=False, base_url=None,
diff --git a/src/lxml/html/tests/test_basic.txt b/src/lxml/html/tests/test_basic.txt
index d7066402..ddbb0bf5 100644
--- a/src/lxml/html/tests/test_basic.txt
+++ b/src/lxml/html/tests/test_basic.txt
@@ -160,3 +160,20 @@ Bug 690319: Leading whitespace before doctype declaration should not raise an er
     >>> print(tostring(etree_document, encoding=unicode))
     <html></html>
 
+Feature https://github.com/lxml/lxml/pull/140: ensure_head_body option:
+
+    >>> from lxml.html import document_fromstring, tostring
+    >>> from functools import partial
+    >>> tos = partial(tostring, encoding=unicode)
+    >>> print(tos(document_fromstring('<p>test</p>')))
+    <html><body><p>test</p></body></html>
+    >>> print(tos(document_fromstring('<p>test</p>', ensure_head_body=True)))
+    <html><head></head><body><p>test</p></body></html>
+    >>> print(tos(document_fromstring('<meta>')))
+    <html><head><meta></head></html>
+    >>> print(tos(document_fromstring('<meta>', ensure_head_body=True)))
+    <html><head><meta></head><body></body></html>
+    >>> print(tos(document_fromstring('<html></html>')))
+    <html></html>
+    >>> print(tos(document_fromstring('<html></html>', ensure_head_body=True)))
+    <html><head></head><body></body></html>
