blob: 3b771c756bff24fa08c84acd49e5d0760707e27d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
import urllib.request, urllib.parse, urllib.error
from pyparsing import makeHTMLTags, SkipTo
# Example script: fetch a web page and list every anchor (<a>) tag's body
# text together with its href attribute, using pyparsing.

# read HTML from a web page; the with-block closes the connection even if
# read() raises
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    rawHtml = serverListPage.read()
    # read() returns bytes: take the charset declared in the response
    # headers, falling back to UTF-8 when the server does not declare one
    charset = serverListPage.headers.get_content_charset() or "utf-8"

# decode to str before parsing so the parser scans real text rather than a
# bytes object; undecodable bytes are replaced instead of aborting the scan
htmlText = rawHtml.decode(charset, errors="replace")

# using makeHTMLTags to define opening and closing tags
anchorStart, anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference; everything between the
# opening and closing tag is captured under the results name "body"
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens, start, end in anchor.scanString(htmlText):
    print(tokens.body, '->', tokens.href)
|