diff options
| author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2017-01-17 15:06:17 +0000 |
|---|---|---|
| committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2017-01-17 15:06:17 +0000 |
| commit | f06ddb05d28b578df8560d3e96475f2b924e503c (patch) | |
| tree | 36458f4af6da46f7893a72d375207cac120fd2b1 /docutils/tools/dev | |
| parent | eca3ae63b963b8183831c2af653f61274df13dcc (diff) | |
| download | docutils-f06ddb05d28b578df8560d3e96475f2b924e503c.tar.gz | |
Generate the complete punctuation_chars module with the corresponding tool.
git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@8016 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
Diffstat (limited to 'docutils/tools/dev')
| -rw-r--r-- | docutils/tools/dev/generate_punctuation_chars.py | 250 |
1 file changed, 190 insertions, 60 deletions
diff --git a/docutils/tools/dev/generate_punctuation_chars.py b/docutils/tools/dev/generate_punctuation_chars.py index eb1c73372..504780a13 100644 --- a/docutils/tools/dev/generate_punctuation_chars.py +++ b/docutils/tools/dev/generate_punctuation_chars.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# :Copyright: © 2011, 2016 Günter Milde. +# :Copyright: © 2011, 2017 Günter Milde. # :License: Released under the terms of the `2-Clause BSD license`_, in short: # # Copying and distribution of this file, with or without modification, @@ -14,37 +14,131 @@ # # :: -import sys, re -import unicodedata - -# import the punctuation_chars module from the source or Py3k build -# path for local Python modules -if sys.version_info < (3,): - sys.path.insert(0, '../../docutils') -else: - sys.path.insert(0, '../../build/lib') - unichr = chr - -from docutils.utils.punctuation_chars import (openers, closers, delimiters, - closing_delimiters) +"""(Re)generate the utils.punctuation_chars module.""" # (re)generate the utils.punctuation_chars module # =============================================== # -# The category of some characters may change with the development of the +# The category of some characters can change with the development of the # Unicode standard. This tool checks the patterns in `utils.punctuation_chars` # against a re-calculation based on the "unicodedata" stdlib module # which may give different results for different Python versions. # -# Updating the patterns with a new (Python|Unicode standard) version is an API -# change (may render valid rST documents invalid). It should only be done for -# "feature releases" and requires also updating the specification of `inline -# markup recognition rules`_ in ../../docs/ref/rst/restructuredtext.txt. +# Updating the module with changed `unicode_punctuation_categories` (due to +# a new Python or Unicode standard version) is an API change (may render valid +# rST documents invalid). 
It should only be done for "feature releases" and +# requires also updating the specification of `inline markup recognition +# rules`_ in ../../docs/ref/rst/restructuredtext.txt. +# +# .. _inline markup recognition rules: +# ../../docs/ref/rst/restructuredtext.html#inline-markup + + +# Setup:: + +import sys, re +import unicodedata + +if sys.version_info >= (3,): + unichr = chr # unichr not available in Py3k +else: + import codecs + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + + +# Template for utils.punctuation_chars +# ------------------------------------ # +# Problem: ``ur`` prefix fails with Py 3.5 :: + +module_template = u'''#!/usr/bin/env python +# -*- coding: utf-8 -*- +# :Id: $Id$ +# :Copyright: © 2011, 2017 Günter Milde. +# :License: Released under the terms of the `2-Clause BSD license`_, in short: +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. +# This file is offered as-is, without any warranty. +# +# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause +# +# This file is generated by +# ``docutils/tools/dev/generate_punctuation_chars.py``. +# :: + +import sys, re +import unicodedata + +"""Docutils character category patterns. + + Patterns for the implementation of the `inline markup recognition rules`_ + in the reStructuredText parser `docutils.parsers.rst.states.py` based + on Unicode character categories. + The patterns are used inside ``[ ]`` in regular expressions. + + Rule (5) requires determination of matching open/close pairs. However, the + pairing of open/close quotes is ambiguous due to different typographic + conventions in different languages. The ``quote_pairs`` function tests + whether two characters form an open/close pair. 
+ + The patterns are generated by + ``docutils/tools/dev/generate_punctuation_chars.py`` to prevent dependence + on the Python version and avoid the time-consuming generation with every + Docutils run. See there for motives and implementation details. + + The category of some characters changed with the development of the + Unicode standard. The current lists are generated with the help of the + "unicodedata" module of Python %(python_version)s (based on Unicode version %(unidata_version)s). + + .. _inline markup recognition rules: + http://docutils.sf.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules +""" + +%(openers)s +%(closers)s +%(delimiters)s +if sys.maxunicode >= 0x10FFFF: # "wide" build +%(delimiters_wide)s +closing_delimiters = u'\\\\\\\\.,;!?' + + +# Matching open/close quotes +# -------------------------- + +quote_pairs = {# open char: matching closing characters # usage example + u'\\xbb': u'\\xbb', # » » Swedish + u'\\u2018': u'\\u201a', # ‘ ‚ Albanian/Greek/Turkish + u'\\u2019': u'\\u2019', # ’ ’ Swedish + u'\\u201a': u'\\u2018\\u2019', # ‚ ‘ German ‚ ’ Polish + u'\\u201c': u'\\u201e', # “ „ Albanian/Greek/Turkish + u'\\u201e': u'\\u201c\\u201d', # „ “ German „ ” Polish + u'\\u201d': u'\\u201d', # ” ” Swedish + u'\\u203a': u'\\u203a', # › › Swedish + } +"""Additional open/close quote pairs.""" + +def match_chars(c1, c2): + """Test whether `c1` and `c2` are a matching open/close character pair. + + Matching open/close pairs are at the same position in + `punctuation_chars.openers` and `punctuation_chars.closers`. + The pairing of open/close quotes is ambiguous due to different + typographic conventions in different languages, + so we test for additional matches stored in `quote_pairs`. 
+ """ + try: + i = openers.index(c1) + except ValueError: # c1 not in openers + return False + return c2 == closers[i] or c2 in quote_pairs.get(c1, u'')\ +''' + + # Generation of the character category patterns # ---------------------------------------------- # -# # Unicode punctuation character categories # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # @@ -160,11 +254,6 @@ def character_category_patterns(): # non-matching, after markup closing_delimiters = [r'\\.,;!?'] - # # Test open/close matching: - # for i in range(min(len(openers),len(closers))): - # print('%4d %s %s' % (i, openers[i].encode('utf8'), - # closers[i].encode('utf8')) - return [u''.join(chars) for chars in (openers, closers, delimiters, closing_delimiters)] @@ -202,7 +291,7 @@ def mark_intervals(s): return ''.join(l2) def wrap_string(s, startstring= "(u'", - endstring = "')", wrap=65): + endstring = "')", wrap=67): """Line-wrap a unicode string literal definition.""" c = len(startstring) contstring = "'\n" + ' ' * (len(startstring)-2) + "u'" @@ -228,6 +317,18 @@ def print_differences(old, new, name): for c in old: if c not in new: print(' %04x'%ord(c), unicodedata.name(c)) + else: + print('%s unchanged' % name) + +def print_quote_pairs(): + pairs = [(o,c) for o,c in quote_pairs.items()] + for o,c in sorted(pairs): + print((u'%s %s' % (o,c)).encode('utf8')) + + # # Test open/close matching: + # for i in range(min(len(openers),len(closers))): + # print('%4d %s %s' % (i, openers[i].encode('utf8'), + # closers[i].encode('utf8')) # Output @@ -237,60 +338,89 @@ def print_differences(old, new, name): if __name__ == '__main__': -# (Re)create and compare character patterns -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + import argparse + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-t', '--test', action="store_true", + help='test for changed character categories') + args = parser.parse_args() + +# (Re)create character patterns +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # :: (o, c, d, 
cd) = character_category_patterns() + +# Characters in the upper plane require a "wide" build:: + o, o_wide = separate_wide_chars(o) c, c_wide = separate_wide_chars(c) d, d_wide = separate_wide_chars(d) + +# delimiters: sort and use shortcut for intervals (saves ~150 characters) +# (`openers` and `closers` must be verbose and keep order +# because they are also used in `match_chars()`):: + d = d[:5] + mark_intervals(d[5:]) d_wide = mark_intervals(d_wide) - print_differences(openers, o, 'openers') - if o_wide: - print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8')) - print_differences(closers, c, 'closers') - if c_wide: - print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8')) - print_differences(delimiters, d + d_wide, 'delimiters') - print_differences(closing_delimiters, cd, 'closing_delimiters') - -# Print literal code to define the character sets -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# This code can be copied to punctuation_chars.py if an update is wanted. 
- -# Unicode version:: +# Test: compare module content with re-generated definitions +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# :: - print('# based on Unicode version %s' % unicodedata.unidata_version) + if args.test: -# `openers` and `closers` must be verbose and keep order because they are -# also used in `match_chars()`:: +# Import the punctuation_chars module from the source +# or Py3k build path for local Python modules:: - print(wrap_string(o.encode('unicode-escape').decode(), - startstring="openers = (u'")) - print(wrap_string(c.encode('unicode-escape').decode(), - startstring="closers = (u'")) + if sys.version_info < (3,): + sys.path.insert(0, '../../docutils') + else: + sys.path.insert(0, '../../build/lib') -# delimiters: sort and use shortcut for intervals (saves ~150 characters):: + from docutils.utils.punctuation_chars import (openers, closers, + delimiters, closing_delimiters) - print(wrap_string(d.encode('unicode-escape').decode(), - startstring="delimiters = (u'")) + print('Check for differences between the current `punctuation_chars`' + ' module\n and a regeneration based on Unicode version %s:' + % unicodedata.unidata_version) -# add characters in the upper plane only in a "wide" build:: + print_differences(openers, o, 'openers') + if o_wide: + print('+ openers-wide = ur"""%s"""' % o_wide.encode('utf8')) + print_differences(closers, c, 'closers') + if c_wide: + print('+ closers-wide = ur"""%s"""' % c_wide.encode('utf8')) - print('if sys.maxunicode >= 0x10FFFF: # "wide" build') - print(wrap_string(d_wide.encode('unicode-escape').decode(), - startstring=" delimiters += (u'")) + print_differences(delimiters, d + d_wide, 'delimiters') + print_differences(closing_delimiters, cd, 'closing_delimiters') -# additional closing delimiters:: + sys.exit() - print(wrap_string(cd.encode('unicode-escape').decode(), - startstring="closing_delimiters = (u'")) +# Print re-generation of the punctuation_chars module +# 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The output can be copied to docutils/utils if an update is wanted +# (API change, see Intro). + +# Replacements:: + + substitutions = { + 'python_version': '.'.join(str(s) for s in sys.version_info[:3]), + 'unidata_version': unicodedata.unidata_version, + 'openers': wrap_string(o.encode('unicode-escape').decode(), + startstring="openers = (u'"), + 'closers': wrap_string(c.encode('unicode-escape').decode(), + startstring="closers = (u'"), + 'delimiters': wrap_string(d.encode('unicode-escape').decode(), + startstring="delimiters = (u'"), + 'delimiters_wide': wrap_string( + d_wide.encode('unicode-escape').decode(), + startstring=" delimiters += (u'") + } + + print(module_template % substitutions) # test prints |
