# src/lxml/html/tests/test_elementsoup.py

import unittest, sys
from lxml.tests.common_imports import make_doctest, HelperTestCase

# Probe for the optional soupparser module.  BS_INSTALLED records whether
# the import succeeded; the test case below is only defined when it did.
try:
    import lxml.html.soupparser
    BS_INSTALLED = True
except ImportError:
    # If a BeautifulSoup package ('bs4' or 'BeautifulSoup') is already
    # loaded, the failure is not just a missing dependency, so re-raise
    # instead of silently disabling the tests.
    if 'bs4' in sys.modules or 'BeautifulSoup' in sys.modules:
        raise  # seems we managed to import BS but not soupparser
    BS_INSTALLED = False

from lxml.html import tostring


if BS_INSTALLED:
    class SoupParserTestCase(HelperTestCase):
        soupparser = lxml.html.soupparser

        def test_broken_attribute(self):
            html = """\
              <html><head></head><body>
                <form><input type='text' disabled size='10'></form>
              </body></html>
            """
            root = self.soupparser.fromstring(html)
            self.assertTrue(root.find('.//input').get('disabled') is not None)

        def test_empty(self):
            tree = self.soupparser.fromstring('')
            res = b'''<html></html>'''
            self.assertEqual(tostring(tree), res)

        def test_text(self):
            tree = self.soupparser.fromstring('huhu')
            res = b'''<html>huhu</html>'''
            self.assertEqual(tostring(tree), res)

        def test_body(self):
            html = '''<body><p>test</p></body>'''
            res = b'''<html><body><p>test</p></body></html>'''
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)

        def test_head_body(self):
            # HTML tag missing, parser should fix that
            html = '<head><title>test</title></head><body><p>test</p></body>'
            res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)

        def test_wrap_html(self):
            # <head> outside <html>, parser should fix that
            html = '<head><title>title</test></head><html><body/></html>'
            res = b'<html><head><title>title</title></head><body></body></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), res)

        def test_comment_hyphen(self):
            # These are really invalid XML as per specification
            # https://www.w3.org/TR/REC-xml/#sec-comments
            html = b'<html><!-- comment -- with double-hyphen --></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), html)

            html = b'<html><!-- comment ends with hyphen ---></html>'
            tree = self.soupparser.fromstring(html)
            self.assertEqual(tostring(tree), html)

        def test_comment_pi(self):
            html = '''<!-- comment -->
<?test asdf?>
<head><title>test</title></head><body><p>test</p></body>
<!-- another comment -->'''
            res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tostring(tree, method='html'), res)

        def test_doctype1(self):
            # Test document type declaration, comments and PI's
            # outside the root
            html = \
'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''

            res = \
b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tree.docinfo.public_id, "-//W3C//DTD HTML 4.01//EN")
            self.assertEqual(tostring(tree), res)

        def test_doctype2(self):
            # Test document type declaration, comments and PI's
            # outside the root
            html = \
'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            res = \
b'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertEqual(tree.docinfo.public_id, "-//IETF//DTD HTML//EN")
            self.assertEqual(tostring(tree), res)

        def test_doctype_html5(self):
            # html 5 doctype declaration
            html = b'<!DOCTYPE html>\n<html lang="en"></html>'

            tree = self.soupparser.fromstring(html).getroottree()
            self.assertTrue(tree.docinfo.public_id is None)
            self.assertEqual(tostring(tree), html)


def test_suite():
    """Build the test suite for this module.

    Returns an empty ``unittest.TestSuite`` when ``BS_INSTALLED`` is false.
    Otherwise the suite holds the ``SoupParserTestCase`` tests and, on
    Python 2 only, the doctests from ``doc/elementsoup.txt``.
    """
    suite = unittest.TestSuite()
    if BS_INSTALLED:
        # unittest.makeSuite() was deprecated in Python 3.11 and removed in
        # 3.13; the default loader collects the same 'test*' methods.
        loader = unittest.defaultTestLoader
        suite.addTests([loader.loadTestsFromTestCase(SoupParserTestCase)])
        if sys.version_info[0] < 3:
            suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
    return suite


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()