diff options
Diffstat (limited to 'Tools/scripts/parse_html5_entities.py')
| -rwxr-xr-x | Tools/scripts/parse_html5_entities.py | 114 | 
1 files changed, 0 insertions, 114 deletions
#!/usr/bin/env python3
# Recovered from the removal diff of Tools/scripts/parse_html5_entities.py
# (deleted file mode 100755, index 1e5bdad216..0000000000).
"""
Utility for parsing HTML5 entity definitions available from:

    https://html.spec.whatwg.org/entities.json
    https://html.spec.whatwg.org/multipage/named-characters.html

The page now contains the following note:

    "This list is static and will not be expanded or changed in the future."

Usage:

    without arguments  print a diff between html.entities.html5 and the
                       freshly downloaded definitions
    --create           print the newly generated html5 dict
    --patch            rewrite the html5 section of Lib/html/entities.py

Written by Ezio Melotti and Iuliia Proskurnia.
"""

import os
import sys
import json
from urllib.request import urlopen
from html.entities import html5

PAGE_URL = 'https://html.spec.whatwg.org/multipage/named-characters.html'
ENTITIES_URL = 'https://html.spec.whatwg.org/entities.json'
HTML5_SECTION_START = '# HTML5 named character references'
# File rewritten by --patch; the path is relative to the current directory.
ENTITIES_FILE = 'Lib/html/entities.py'


def get_json(url):
    """Download the JSON document at *url* and return the decoded object."""
    with urlopen(url) as f:
        data = f.read().decode('utf-8')
    return json.loads(data)


def create_dict(entities):
    """Create the html5 dict from the decoded JSON object.

    *entities* maps names such as '&amp;' to a mapping that has a
    'characters' item.  The returned dict maps each name, with its leading
    '&' removed, to that 'characters' value.
    """
    return {name.lstrip('&'): value['characters']
            for name, value in entities.items()}


def compare_dicts(old, new):
    """Print the entities added to, removed from and modified in *new*.

    Both arguments are dicts mapping entity names to their characters.
    Nothing is printed for a category that has no entries.
    """
    added = new.keys() - old.keys()
    if added:
        print(f'{len(added)} entitie(s) have been added:')
        for name in sorted(added):
            print(f'  {name!r}: {new[name]!r}')
    removed = old.keys() - new.keys()
    if removed:
        print(f'{len(removed)} entitie(s) have been removed:')
        for name in sorted(removed):
            print(f'  {name!r}: {old[name]!r}')
    changed = sorted(
        (name, old[name], new[name])
        for name in old.keys() & new.keys()
        if old[name] != new[name]
    )
    if changed:
        print(f'{len(changed)} entitie(s) have been modified:')
        for name, old_value, new_value in changed:
            print(f'  {name!r}: {old_value!r} -> {new_value!r}')


def write_items(entities, file=None):
    """Write the html5 section generated from *entities* to *file*.

    *file* is any object accepted by print(); when it is None the output
    goes to whatever sys.stdout is at call time (the default is no longer
    frozen at definition time, so redirected stdout is honoured).
    """
    # The keys in the generated dictionary should be sorted in a
    # case-insensitive way, however, when two keys are equal, the
    # uppercase version should come first so that the result looks like:
    # ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
    # Sorting on (lowercased key, key) gives exactly that order: the
    # case-sensitive comparison only breaks ties, and uppercase sorts first.
    keys = sorted(entities, key=lambda name: (name.lower(), name))
    print(HTML5_SECTION_START, file=file)
    print(f'# Generated by {sys.argv[0]!r}\n'
          f'# from {ENTITIES_URL} and\n'
          f'# {PAGE_URL}.\n'
          f'# Map HTML5 named character references to the '
          f'equivalent Unicode character(s).', file=file)
    print('html5 = {', file=file)
    for name in keys:
        # !a keeps the generated source pure ASCII.
        print(f'    {name!r}: {entities[name]!a},', file=file)
    print('}', file=file)


def _patch_file(fname, entities):
    """Replace the html5 section of *fname* with one built from *entities*.

    The section starts at the line beginning with HTML5_SECTION_START and
    ends at the first following line that starts with '}'.  Return True if
    the file was rewritten.  Return False, leaving *fname* untouched, when
    the section start is missing or the section is never closed.
    """
    temp_fname = fname + '.temp'
    found = False
    skip = False
    try:
        with open(fname, encoding='utf-8') as src, \
                open(temp_fname, 'w', encoding='utf-8') as dst:
            for line in src:
                if line.startswith(HTML5_SECTION_START):
                    write_items(entities, file=dst)
                    found = True
                    skip = True
                    continue
                if skip:
                    # skip the old items until the }
                    if line.startswith('}'):
                        skip = False
                    continue
                dst.write(line)
        patched = found and not skip
        if patched:
            # os.replace swaps the file in one step, so there is never a
            # moment where fname does not exist.
            os.replace(temp_fname, fname)
        return patched
    finally:
        # Only still present if the patch failed or was abandoned.
        if os.path.exists(temp_fname):
            os.remove(temp_fname)


def main():
    """Download the entity definitions and act on the command line flags."""
    new_html5 = create_dict(get_json(ENTITIES_URL))
    if '--create' in sys.argv:
        write_items(new_html5)
    elif '--patch' in sys.argv:
        if not _patch_file(ENTITIES_FILE, new_html5):
            sys.exit(f'Unable to find a complete {HTML5_SECTION_START!r} '
                     f'section in {ENTITIES_FILE}; file left unchanged.')
    elif html5 == new_html5:
        print('The current dictionary is updated.')
    else:
        compare_dicts(html5, new_html5)
        print('Run "./python {0} --patch" to update {1} '
              'or "./python {0} --create" to see the generated '
              'dictionary.'.format(__file__, ENTITIES_FILE))


if __name__ == '__main__':
    main()
