summaryrefslogtreecommitdiff
path: root/src/lxml/html/tests/test_basic.txt
blob: 5a59d3d004549b61f126d00b373847a6fe21bfe6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
lxml.html adds a find_class method to elements::

    >>> from lxml.etree import Comment
    >>> from lxml.html import document_fromstring, fragment_fromstring, tostring
    >>> from lxml.html.clean import clean, clean_html
    >>> from lxml.html import usedoctest
    >>> try: unicode = __builtins__["unicode"]
    ... except (KeyError, NameError): unicode = str

    >>> h = document_fromstring('''
    ... <html><head></head>
    ... <body>
    ...   <a class="vcard
    ... fn   url" href="foobar">P1</a>
    ...   <a class="not-fn vcard" href="baz">P2</a>
    ... </body></html>''')
    >>> print(tostring(h, encoding=unicode))
    <html>
      <head></head>
      <body>
        <a class="vcard
    fn   url" href="foobar">P1</a>
        <a class="not-fn vcard" href="baz">P2</a>
      </body>
    </html>
    >>> print([e.text for e in h.find_class('fn')])
    ['P1']
    >>> print([e.text for e in h.find_class('vcard')])
    ['P1', 'P2']

Also added is a ``find_rel_links`` method, which you can use to search for
links like ``<a rel="$something">``::

    >>> h = document_fromstring('''
    ... <a href="1">test 1</a>
    ... <a href="2" rel="tag">item 2</a>
    ... <a href="3" rel="tagging">item 3</a>
    ... <a href="4" rel="TAG">item 4</a>''')
    >>> print([e.attrib['href'] for e in h.find_rel_links('tag')])
    ['2', '4']
    >>> print([e.attrib['href'] for e in h.find_rel_links('nofollow')])
    []

Another method is ``get_element_by_id``, which does what it says::

    >>> print(tostring(fragment_fromstring('''
    ... <div>
    ...  <span id="test">stuff</span>
    ... </div>''').get_element_by_id('test'), encoding=unicode))
    <span id="test">stuff</span>

Or to get the content of an element without the tags, use text_content()::

    >>> el = fragment_fromstring('''
    ... <div>This is <a href="foo">a <b>bold</b> link</a></div>''')
    >>> el.text_content()
    'This is a bold link'

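You can also drop just an element's tag (leaving its content in place) with ``drop_tag()``, or drop the element together with its entire subtree with ``drop_tree()``::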

    >>> doc = document_fromstring('''
    ... <html>
    ...  <body>
    ...   <div id="body">
    ...    This is a <a href="foo" id="link">test</a> of stuff.
    ...   </div>
    ...   <!-- a comment -->
    ...   <div>footer</div>
    ...  </body>
    ... </html>''')
    >>> doc.get_element_by_id('link').drop_tag()
    >>> print(tostring(doc, encoding=unicode))
    <html>
     <body>
      <div id="body">
       This is a test of stuff.
      </div>
      <!-- a comment -->
      <div>footer</div>
     </body>
    </html>
    >>> doc.get_element_by_id('body').drop_tree()
    >>> print(tostring(doc, encoding=unicode))
    <html>
     <body>
      <!-- a comment -->
      <div>footer</div>
     </body>
    </html>

Note, however, that comment text will not be merged into the tree when you
drop the comment.  Here, ``drop_tag()`` behaves exactly like ``drop_tree()``::

    >>> for comment in doc.getiterator(Comment):
    ...     comment.drop_tag()
    >>> print(tostring(doc, encoding=unicode))
    <html>
     <body>
      <div>footer</div>
     </body>
    </html>