summary | refs | log | tree | commit | diff
path: root/src/lxml/html
diff options
context:
space:
mode:
author: Stefan Behnel <stefan_ml@behnel.de>, 2017-08-12 17:02:12 +0200
committer: Stefan Behnel <stefan_ml@behnel.de>, 2017-08-12 17:02:12 +0200
commit: db896e145430b572bf3fbab38dcb4dd5a1fdf3bc (patch)
tree: 66a5919f3949d9fcf5b2998e92d0945fff91d3ec /src/lxml/html
parent: 1ff0b9364b758679ffb2612379ad589a1c1e6aac (diff)
download: python-lxml-db896e145430b572bf3fbab38dcb4dd5a1fdf3bc.tar.gz
Only pass "useChardet" option into html5parser by default if the input is a byte string. Should help with LP#1654544.
Diffstat (limited to 'src/lxml/html')
-rw-r--r-- src/lxml/html/html5parser.py 24
1 files changed, 18 insertions, 6 deletions
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
index 99d8c039..7c101af9 100644
--- a/src/lxml/html/html5parser.py
+++ b/src/lxml/html/html5parser.py
@@ -32,14 +32,14 @@ def _dodgeUseChardet(fn):
# detect the argument to be unicode (all of that code is private),
# so we'll have to settle for a retry.
- # this decorator wraps around the a method, which is retried when html5lib
+ # this decorator wraps a method, which is retried when html5lib
# complains about the useChardet argument
@functools.wraps(fn)
def inner(*args, **kwargs):
try:
return fn(*args, **kwargs)
except TypeError as exception:
- if "'useChardet'" not in str(exception):
+ if 'useChardet' not in kwargs or "'useChardet'" not in str(exception):
# Some other issue caused the exception. Tell the caller
raise
kwargs.pop('useChardet')
@@ -86,7 +86,7 @@ def _find_tag(tree, tag):
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
-def document_fromstring(html, guess_charset=True, parser=None):
+def document_fromstring(html, guess_charset=None, parser=None):
"""Parse a whole document into a string."""
if not isinstance(html, _strings):
raise TypeError('string required')
@@ -94,11 +94,17 @@ def document_fromstring(html, guess_charset=True, parser=None):
if parser is None:
parser = html_parser
- return parser.parse(html, useChardet=guess_charset).getroot()
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+ # FIXME: I have no idea why the default differs from fragments_fromstring()
+ guess_charset = True
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ return parser.parse(html, **options).getroot()
def fragments_fromstring(html, no_leading_text=False,
- guess_charset=False, parser=None):
+ guess_charset=None, parser=None):
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true,
@@ -115,7 +121,13 @@ def fragments_fromstring(html, no_leading_text=False,
if parser is None:
parser = html_parser
- children = parser.parseFragment(html, 'div', useChardet=guess_charset)
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+ # FIXME: I have no idea why the default differs from document_fromstring()
+ guess_charset = False
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ children = parser.parseFragment(html, 'div', **options)
if children and isinstance(children[0], _strings):
if no_leading_text:
if children[0].strip():