summaryrefslogtreecommitdiff
path: root/examples/htmlTableParser.py
diff options
context:
space:
mode:
author	Jon Dufresne <jon.dufresne@gmail.com>	2019-10-31 21:10:28 -0700
committer	Paul McGuire <ptmcg@users.noreply.github.com>	2019-10-31 23:10:28 -0500
commit	53d1b4a6f48a53c4c4ec4ac7031362b691c0366d (patch)
tree	088ad3cf3561b78a00af4fb2fd474f4a2b8ca70c /examples/htmlTableParser.py
parent	41752aa52cc97c710474bb2972cceab057b52ad4 (diff)
download	pyparsing-git-53d1b4a6f48a53c4c4ec4ac7031362b691c0366d.tar.gz
Blacken the project (#141)
Diffstat (limited to 'examples/htmlTableParser.py')
-rw-r--r--	examples/htmlTableParser.py	48
1 file changed, 29 insertions, 19 deletions
diff --git a/examples/htmlTableParser.py b/examples/htmlTableParser.py
index 35cdd03..e96a913 100644
--- a/examples/htmlTableParser.py
+++ b/examples/htmlTableParser.py
@@ -11,42 +11,51 @@ import urllib.request
# define basic HTML tags, and compose into a Table
-table, table_end = pp.makeHTMLTags('table')
-thead, thead_end = pp.makeHTMLTags('thead')
-tbody, tbody_end = pp.makeHTMLTags('tbody')
-tr, tr_end = pp.makeHTMLTags('tr')
-th, th_end = pp.makeHTMLTags('th')
-td, td_end = pp.makeHTMLTags('td')
-a, a_end = pp.makeHTMLTags('a')
+table, table_end = pp.makeHTMLTags("table")
+thead, thead_end = pp.makeHTMLTags("thead")
+tbody, tbody_end = pp.makeHTMLTags("tbody")
+tr, tr_end = pp.makeHTMLTags("tr")
+th, th_end = pp.makeHTMLTags("th")
+td, td_end = pp.makeHTMLTags("td")
+a, a_end = pp.makeHTMLTags("a")
# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString
# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
-link = pp.Group(a + a.tag_body('text') + a_end.suppress())
+link = pp.Group(a + a.tag_body("text") + a_end.suppress())
link.addParseAction(lambda t: (t[0].text, t[0].href))
# method to create table rows of header and data tags
def table_row(start_tag, end_tag):
body = start_tag.tag_body
- body.addParseAction(pp.tokenMap(str.strip),
- pp.tokenMap(strip_html))
- row = pp.Group(tr.suppress()
- + pp.ZeroOrMore(start_tag.suppress()
- + body
- + end_tag.suppress())
- + tr_end.suppress())
+ body.addParseAction(pp.tokenMap(str.strip), pp.tokenMap(strip_html))
+ row = pp.Group(
+ tr.suppress()
+ + pp.ZeroOrMore(start_tag.suppress() + body + end_tag.suppress())
+ + tr_end.suppress()
+ )
return row
+
th_row = table_row(th, th_end)
td_row = table_row(td, td_end)
# define expression for overall table - may vary slightly for different pages
-html_table = table + tbody + pp.Optional(th_row('headers')) + pp.ZeroOrMore(td_row)('rows') + tbody_end + table_end
+html_table = (
+ table
+ + tbody
+ + pp.Optional(th_row("headers"))
+ + pp.ZeroOrMore(td_row)("rows")
+ + tbody_end
+ + table_end
+)
# read in a web page containing an interesting HTML table
-with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_tz_database_time_zones") as page:
+with urllib.request.urlopen(
+ "https://en.wikipedia.org/wiki/List_of_tz_database_time_zones"
+) as page:
page_html = page.read().decode()
tz_table = html_table.searchString(page_html)[0]
@@ -55,7 +64,8 @@ tz_table = html_table.searchString(page_html)[0]
rows = [dict(zip(tz_table.headers, row)) for row in tz_table.rows]
# make a dict keyed by TZ database name
-tz_db = {row['TZ database name']: row for row in rows}
+tz_db = {row["TZ database name"]: row for row in rows}
from pprint import pprint
-pprint(tz_db['America/Chicago'])
+
+pprint(tz_db["America/Chicago"])