summaryrefslogtreecommitdiff
path: root/compressor/parser/html5lib.py
blob: 0a96750360a05614ed970017b8ee3da95237337c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from django.core.exceptions import ImproperlyConfigured
from django.utils.encoding import smart_str
from django.utils.functional import cached_property

from compressor.exceptions import ParserError
from compressor.parser import ParserBase


class Html5LibParser(ParserBase):
    def __init__(self, content):
        super().__init__(content)
        import html5lib

        self.html5lib = html5lib

    def _serialize(self, elem):
        return self.html5lib.serialize(
            elem,
            tree="etree",
            quote_attr_values="always",
            omit_optional_tags=False,
        )

    def _find(self, *names):
        for elem in self.html:
            if elem.tag in names:
                yield elem

    @cached_property
    def html(self):
        try:
            return self.html5lib.parseFragment(self.content, treebuilder="etree")
        except ImportError as err:
            raise ImproperlyConfigured("Error while importing html5lib: %s" % err)
        except Exception as err:
            raise ParserError("Error while initializing Parser: %s" % err)

    def css_elems(self):
        return self._find(
            "{http://www.w3.org/1999/xhtml}link", "{http://www.w3.org/1999/xhtml}style"
        )

    def js_elems(self):
        return self._find("{http://www.w3.org/1999/xhtml}script")

    def elem_attribs(self, elem):
        return elem.attrib

    def elem_content(self, elem):
        return smart_str(elem.text)

    def elem_name(self, elem):
        if "}" in elem.tag:
            return elem.tag.split("}")[1]
        return elem.tag

    def elem_str(self, elem):
        # This method serializes HTML in a way that does not pass all tests.
        # However, this method is only called in tests anyway, so it doesn't
        # really matter.
        return smart_str(self._serialize(elem))