diff options
| author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2017-01-17 15:06:17 +0000 |
|---|---|---|
| committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2017-01-17 15:06:17 +0000 |
| commit | f06ddb05d28b578df8560d3e96475f2b924e503c (patch) | |
| tree | 36458f4af6da46f7893a72d375207cac120fd2b1 /docutils/tools/dev | |
| parent | eca3ae63b963b8183831c2af653f61274df13dcc (diff) | |
| download | docutils-f06ddb05d28b578df8560d3e96475f2b924e503c.tar.gz | |
Generate the complete punctuation_chars module with the corresponding tool.
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@8016 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'docutils/tools/dev')
| -rw-r--r-- | docutils/tools/dev/generate_punctuation_chars.py | 250 |
1 file changed, 190 insertions, 60 deletions
diff --git a/docutils/tools/dev/generate_punctuation_chars.py b/docutils/tools/dev/generate_punctuation_chars.py index eb1c73372..504780a13 100644 --- a/docutils/tools/dev/generate_punctuation_chars.py +++ b/docutils/tools/dev/generate_punctuation_chars.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# :Copyright: © 2011, 2016 Günter Milde. +# :Copyright: © 2011, 2017 Günter Milde. # :License: Released under the terms of the `2-Clause BSD license`_, in short: # # Copying and distribution of this file, with or without modification, @@ -14,37 +14,131 @@ # # :: -import sys, re -import unicodedata - -# import the punctuation_chars module from the source or Py3k build -# path for local Python modules -if sys.version_info < (3,): - sys.path.insert(0, '../../docutils') -else: - sys.path.insert(0, '../../build/lib') - unichr = chr - -from docutils.utils.punctuation_chars import (openers, closers, delimiters, - closing_delimiters) +"""(Re)generate the utils.punctuation_chars module.""" # (re)generate the utils.punctuation_chars module # =============================================== # -# The category of some characters may change with the development of the +# The category of some characters can change with the development of the # Unicode standard. This tool checks the patterns in `utils.punctuation_chars` # against a re-calculation based on the "unicodedata" stdlib module # which may give different results for different Python versions. # -# Updating the patterns with a new (Python|Unicode standard) version is an API -# change (may render valid rST documents invalid). It should only be done for -# "feature releases" and requires also updating the specification of `inline -# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt. +# Updating the module with changed `unicode_punctuation_categories` (due to +# a new Python or Unicode standard version) is an API change (may render valid +# rST documents invalid). 
It should only be done for "feature releases" and +# requires also updating the specification of `inline markup recognition +# rules`_ in ../../docs/ref/rst/restructuredtext.txt. +# +# .. _inline markup recognition rules: +# ../../docs/ref/rst/restructuredtext.html#inline-markup + + +# Setup:: + +import sys, re +import unicodedata + +if sys.version_info >= (3,): + unichr = chr # unichr not available in Py3k +else: + import codecs + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + + +# Template for utils.punctuation_chars +# ------------------------------------ # +# Problem: ``ur`` prefix fails with Py 3.5 :: + +module_template = u'''#!/usr/bin/env python +# -*- coding: utf-8 -*- +# :Id: $Id$ +# :Copyright: © 2011, 2017 Günter Milde. +# :License: Released under the terms of the `2-Clause BSD license`_, in short: +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. +# This file is offered as-is, without any warranty. +# +# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause +# +# This file is generated by +# ``docutils/tools/dev/generate_punctuation_chars.py``. +# :: + +import sys, re +import unicodedata + +"""Docutils character category patterns. + + Patterns for the implementation of the `inline markup recognition rules`_ + in the reStructuredText parser `docutils.parsers.rst.states.py` based + on Unicode character categories. + The patterns are used inside ``[ ]`` in regular expressions. + + Rule (5) requires determination of matching open/close pairs. However, the + pairing of open/close quotes is ambiguous due to different typographic + conventions in different languages. The ``quote_pairs`` function tests + whether two characters form an open/close pair. 
+ + The patterns are generated by + ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence + on the Python version and avoid the time-consuming generation with every + Docutils run. See there for motives and implementation details. + + The category of some characters changed with the development of the + Unicode standard. The current lists are generated with the help of the + "unicodedata" module of Python %(python_version)s (based on Unicode version %(unidata_version)s). + + .. _inline markup recognition rules: + http://docutils.sf.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules +""" + +%(openers)s +%(closers)s +%(delimiters)s +if sys.maxunicode >= 0x10FFFF: # "wide" build +%(delimiters_wide)s +closing_delimiters = u'\\\\\\\\.,;!?' + + +# Matching open/close quotes +# -------------------------- + +quote_pairs = {# open char: matching closing characters # usage example + u'\\xbb': u'\\xbb', # » » Swedish + u'\\u2018': u'\\u201a', # ‘ ‚ Albanian/Greek/Turkish + u'\\u2019': u'\\u2019', # ’ ’ Swedish + u'\\u201a': u'\\u2018\\u2019', # ‚ ‘ German ‚ ’ Polish + u'\\u201c': u'\\u201e', # “ „ Albanian/Greek/Turkish + u'\\u201e': u'\\u201c\\u201d', # „ “ German „ ” Polish + u'\\u201d': u'\\u201d', # ” ” Swedish + u'\\u203a': u'\\u203a', # › › Swedish + } +"""Additional open/close quote pairs.""" + +def match_chars(c1, c2): + """Test whether `c1` and `c2` are a matching open/close character pair. + + Matching open/close pairs are at the same position in + `punctuation_chars.openers` and `punctuation_chars.closers`. + The pairing of open/close quotes is ambiguous due to different + typographic conventions in different languages, + so we test for additional matches stored in `quote_pairs`. 
+ """ + try: + i = openers.index(c1) + except ValueError: # c1 not in openers + return False + return c2 == closers[i] or c2 in quote_pairs.get(c1, u'')\ +''' + + # Generation of the character category patterns # ---------------------------------------------- # -# # Unicode punctuation character categories # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -160,11 +254,6 @@ def character_category_patterns(): # non-matching, after markup closing_delimiters = [r'\\.,;!?'] - # # Test open/close matching: - # for i in range(min(len(openers),len(closers))): - # print('%4d %s %s' % (i, openers[i].encode('utf8'), - # closers[i].encode('utf8')) - return [u''.join(chars) for chars in (openers, closers, delimiters, closing_delimiters)] @@ -202,7 +291,7 @@ def mark_intervals(s): return ''.join(l2) def wrap_string(s, startstring= "(u'", - endstring = "')", wrap=65): + endstring = "')", wrap=67): """Line-wrap a unicode string literal definition.""" c = len(startstring) contstring = "'\n" + ' ' * (len(startstring)-2) + "u'" @@ -228,6 +317,18 @@ def print_differences(old, new, name): for c in old: if c not in new: print(' %04x'%ord(c), unicodedata.name(c)) + else: + print('%s unchanged' % name) + +def print_quote_pairs(): + pairs = [(o,c) for o,c in quote_pairs.items()] + for o,c in sorted(pairs): + print((u'%s %s' % (o,c)).encode('utf8')) + + # # Test open/close matching: + # for i in range(min(len(openers),len(closers))): + # print('%4d %s %s' % (i, openers[i].encode('utf8'), + # closers[i].encode('utf8')) # Output @@ -237,60 +338,89 @@ def print_differences(old, new, name): if __name__ == '__main__': -# (Re)create and compare character patterns -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + import argparse + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-t', '--test', action="store_true", + help='test for changed character categories') + args = parser.parse_args() + +# (Re)create character patterns +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # :: (o, c, d, 
cd) = character_category_patterns() + +# Characters in the upper plane require a "wide" build:: + o, o_wide = separate_wide_chars(o) c, c_wide = separate_wide_chars(c) d, d_wide = separate_wide_chars(d) + +# delimiters: sort and use shortcut for intervals (saves ~150 characters) +# (`openers` and `closers` must be verbose and keep order +# because they are also used in `match_chars()`):: + d = d[:5] + mark_intervals(d[5:]) d_wide = mark_intervals(d_wide) - print_differences(openers, o, 'openers') - if o_wide: - print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8')) - print_differences(closers, c, 'closers') - if c_wide: - print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8')) - print_differences(delimiters, d + d_wide, 'delimiters') - print_differences(closing_delimiters, cd, 'closing_delimiters') - -# Print literal code to define the character sets -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# This code can be copied to punctuation_chars.py if an update is wanted. 
- -# Unicode version:: +# Test: compare module content with re-generated definitions +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# :: - print('# based on Unicode version %s' % unicodedata.unidata_version) + if args.test: -# `openers` and `closers` must be verbose and keep order because they are -# also used in `match_chars()`:: +# Import the punctuation_chars module from the source +# or Py3k build path for local Python modules:: - print(wrap_string(o.encode('unicode-escape').decode(), - startstring="openers = (u'")) - print(wrap_string(c.encode('unicode-escape').decode(), - startstring="closers = (u'")) + if sys.version_info < (3,): + sys.path.insert(0, '../../docutils') + else: + sys.path.insert(0, '../../build/lib') -# delimiters: sort and use shortcut for intervals (saves ~150 characters):: + from docutils.utils.punctuation_chars import (openers, closers, + delimiters, closing_delimiters) - print(wrap_string(d.encode('unicode-escape').decode(), - startstring="delimiters = (u'")) + print('Check for differences between the current `punctuation_chars`' + ' module\n and a regeneration based on Unicode version %s:' + % unicodedata.unidata_version) -# add characters in the upper plane only in a "wide" build:: + print_differences(openers, o, 'openers') + if o_wide: + print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8')) + print_differences(closers, c, 'closers') + if c_wide: + print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8')) - print('if sys.maxunicode >= 0x10FFFF: # "wide" build') - print(wrap_string(d_wide.encode('unicode-escape').decode(), - startstring=" delimiters += (u'")) + print_differences(delimiters, d + d_wide, 'delimiters') + print_differences(closing_delimiters, cd, 'closing_delimiters') -# additional closing delimiters:: + sys.exit() - print(wrap_string(cd.encode('unicode-escape').decode(), - startstring="closing_delimiters = (u'")) +# Print re-generation of the punctuation_chars module +# 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The output can be copied to docutils/utils if an update is wanted +# (API change, see Intro). + +# Replacements:: + + substitutions = { + 'python_version': '.'.join(str(s) for s in sys.version_info[:3]), + 'unidata_version': unicodedata.unidata_version, + 'openers': wrap_string(o.encode('unicode-escape').decode(), + startstring="openers = (u'"), + 'closers': wrap_string(c.encode('unicode-escape').decode(), + startstring="closers = (u'"), + 'delimiters': wrap_string(d.encode('unicode-escape').decode(), + startstring="delimiters = (u'"), + 'delimiters_wide': wrap_string( + d_wide.encode('unicode-escape').decode(), + startstring=" delimiters += (u'") + } + + print(module_template % substitutions) # test prints |
