#
# htmlTableParser.py
#
# Example of parsing a simple HTML table into a list of rows, and optionally into a little database
#
# Copyright 2019, Paul McGuire
#
import urllib.request
from pprint import pprint

import pyparsing as pp

# define basic HTML tags, and compose into a Table
table, table_end = pp.makeHTMLTags("table")
thead, thead_end = pp.makeHTMLTags("thead")
tbody, tbody_end = pp.makeHTMLTags("tbody")
tr, tr_end = pp.makeHTMLTags("tr")
th, th_end = pp.makeHTMLTags("th")
td, td_end = pp.makeHTMLTags("td")
a, a_end = pp.makeHTMLTags("a")

# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString

# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
link = pp.Group(a + a.tag_body("text") + a_end.suppress())


def extract_text_and_url(t):
    """Parse action for ``link``: return the link as a ``(text, href)`` tuple.

    ``t[0]`` is the group built by ``link``; its ``text`` result is the body
    between the opening and closing ``<a>`` tags, and ``href`` is read from
    the opening tag's results.
    """
    return (t[0].text, t[0].href)


link.addParseAction(extract_text_and_url)


# method to create table rows of header and data tags
def table_row(start_tag, end_tag):
    """Build an expression matching one ``<tr>`` row of cells.

    Args:
        start_tag: opening cell tag expression (e.g. ``th`` or ``td``), as
            returned by ``pp.makeHTMLTags``.
        end_tag: the matching closing cell tag expression.

    Returns:
        A ``pp.Group`` holding the body of each cell in the row, with the
        ``<tr>`` and cell tags themselves suppressed.
    """
    # NOTE: the parse actions are added to start_tag.tag_body itself (no copy
    # is made here), so call this only once per tag.
    body = start_tag.tag_body
    # clean each cell: trim surrounding whitespace, then remove embedded tags
    body.addParseAction(pp.tokenMap(str.strip), pp.tokenMap(strip_html))
    row = pp.Group(
        tr.suppress()
        + pp.ZeroOrMore(start_tag.suppress() + body + end_tag.suppress())
        + tr_end.suppress()
    )
    return row


th_row = table_row(th, th_end)
td_row = table_row(td, td_end)

# define expression for overall table - may vary slightly for different pages
html_table = (
    table
    + tbody
    + pp.Optional(th_row("headers"))
    + pp.ZeroOrMore(td_row)("rows")
    + tbody_end
    + table_end
)

# web page containing an interesting HTML table
TZ_PAGE_URL = "https://en.wikipedia.org/wiki/List_of_tz_database_time_zones"

# Wikimedia's User-Agent policy asks clients to identify themselves with a
# descriptive User-Agent; requests using a generic library default (such as
# urllib's) may be blocked. Adjust this string if you adapt the script.
USER_AGENT = (
    "htmlTableParser-example/1.0 "
    "(pyparsing demo script; https://github.com/pyparsing/pyparsing)"
)

# read in the web page
request = urllib.request.Request(TZ_PAGE_URL, headers={"User-Agent": USER_AGENT})
with urllib.request.urlopen(request) as page:
    # honor the charset declared by the server, falling back to UTF-8
    charset = page.headers.get_content_charset() or "utf-8"
    page_html = page.read().decode(charset)

# take the first table in the page that matches the expected layout
matching_tables = html_table.searchString(page_html)
if not matching_tables:
    # same exception type as indexing an empty result, but with an explanation
    raise IndexError(
        "no table matching html_table was found at {}; "
        "the page layout may have changed".format(TZ_PAGE_URL)
    )
tz_table = matching_tables[0]

# convert rows to dicts
rows = [dict(zip(tz_table.headers, row)) for row in tz_table.rows]

# make a dict keyed by TZ database name
tz_db = {row["TZ database name"]: row for row in rows}

pprint(tz_db["America/Chicago"])