Add example showing scraping/parsing of an HTML table into a Python dict

author: Paul McGuire <ptmcg@austin.rr.com> 2019-03-30 03:00:27 -0500
committer: Paul McGuire <ptmcg@austin.rr.com> 2019-03-30 03:00:27 -0500
commit: abd05378eb6acf742f2deff4228a0bca4492521b (patch)
tree: 6372ed902de077ba7704b3c7a94f1b38e5473b4e /examples/htmlTableParser.py
parent: bb9d1255548b46dc2ba7a85e26606b7dd4c926f3 (diff)
download: pyparsing-git-abd05378eb6acf742f2deff4228a0bca4492521b.tar.gz
1 files changed, 61 insertions, 0 deletions
diff --git a/examples/htmlTableParser.py b/examples/htmlTableParser.py
new file mode 100644
index 0000000..5fe8e5f
--- /dev/null
+++ b/examples/htmlTableParser.py
@@ -0,0 +1,61 @@
+#
+# htmlTableParser.py
+#
+# Example of parsing a simple HTML table into a list of rows, then into row dicts and a lookup dict keyed by one column
+#
+# Copyright 2019, Paul McGuire
+#
+
+import pyparsing as pp
+import urllib.request
+
+
+# define basic HTML tags, and compose into a Table
+table, table_end = pp.makeHTMLTags('table')
+thead, thead_end = pp.makeHTMLTags('thead')
+tbody, tbody_end = pp.makeHTMLTags('tbody')
+tr, tr_end = pp.makeHTMLTags('tr')
+th, th_end = pp.makeHTMLTags('th')
+td, td_end = pp.makeHTMLTags('td')
+a, a_end = pp.makeHTMLTags('a')
+
+# method to strip HTML tags from a string - will be used to clean up content of table cells
+strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString
+
+# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple (not used by the table expression below)
+link = pp.Group(a + pp.SkipTo(a_end)('text') + a_end.suppress())
+link.addParseAction(lambda t: (t[0].text, t[0].href))
+
+# function to build a <tr>...</tr> row expression whose cells are delimited by the given start/end tags (th or td)
+def table_row(start_tag, end_tag):
+    body = pp.SkipTo(end_tag)
+    body.addParseAction(pp.tokenMap(str.strip),
+                        pp.tokenMap(strip_html))
+    row = pp.Group(tr.suppress()
+                   + pp.ZeroOrMore(start_tag.suppress()
+                                   + body
+                                   + end_tag.suppress())
+                   + tr_end.suppress())
+    return row
+
+th_row = table_row(th, th_end)
+td_row = table_row(td, td_end)
+
+# define expression for overall table - may vary slightly for different pages
+html_table = table + tbody + pp.Optional(th_row('headers')) + pp.ZeroOrMore(td_row)('rows') + tbody_end + table_end
+
+
+# read in a web page containing an interesting HTML table
+with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_tz_database_time_zones") as page:
+    page_html = page.read().decode()
+
+tz_table = html_table.searchString(page_html)[0]
+
+# convert rows to dicts
+rows = [dict(zip(tz_table.headers, row)) for row in tz_table.rows]
+
+# make a dict keyed by TZ database name
+tz_db = {row['TZ database name']: row for row in rows}
+
+from pprint import pprint
+pprint(tz_db['America/Chicago'])
author	Paul McGuire <ptmcg@austin.rr.com>	2019-03-30 03:00:27 -0500
committer	Paul McGuire <ptmcg@austin.rr.com>	2019-03-30 03:00:27 -0500
commit	abd05378eb6acf742f2deff4228a0bca4492521b (patch)
tree	6372ed902de077ba7704b3c7a94f1b38e5473b4e /examples/htmlTableParser.py
parent	bb9d1255548b46dc2ba7a85e26606b7dd4c926f3 (diff)
download	pyparsing-git-abd05378eb6acf742f2deff4228a0bca4492521b.tar.gz