1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
# unicode_denormalizer.py
#
# Demonstration of pyparsing's transform_string() method, to
# convert identifiers in Python source code to equivalent Unicode
# characters. Python's compiler automatically normalizes Unicode
# characters back to their ASCII equivalents, so that identifiers may
# be rewritten using other Unicode characters, and normalize back to
# the same identifier. For instance, Python treats "print" and "𝕡𝓻ᵢ𝓃𝘁"
# and "𝖕𝒓𝗂𝑛ᵗ" all as the same identifier.
#
# The converter must take care to *only* transform identifiers -
# Python keywords must always be represented in base ASCII form. To
# skip over keywords, they are added to the parser/transformer, but
# contain no transforming parse action.
#
# The converter also detects identifiers in placeholders within f-strings.
#
# Copyright 2022, by Paul McGuire
#
import keyword
import random
import unicodedata
import pyparsing as pp
# Shorthand for pyparsing's Unicode character-set namespace.
ppu = pp.pyparsing_unicode

# Base characters for which Unicode look-alikes are collected.
ident_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_0123456789·"

# Maps each base character to a string holding every identifier-body
# character whose NFKC normalization is that base character.  Every entry
# starts out empty and is grown by the loop below.
ident_char_map = dict.fromkeys(ident_chars, "")
for ch in ppu.identbodychars:
    normal = unicodedata.normalize("NFKC", ch)
    if normal not in ident_char_map:
        # normalizes to something outside the base set - not usable
        continue
    ident_char_map[normal] = ident_char_map[normal] + ch
# Multi-character sequences that have alternative spellings.  Each key is
# the sequence to look for; each value is a whitespace-separated list of
# candidate replacements, one of which is picked at random by
# ligature_transformer whenever the key is matched.
# NOTE(review): as displayed, several alternatives look identical to their
# key - confirm that the intended ligature code points are still present in
# this file and were not flattened by an editor or encoding step.
ligature_map = {
    'ffl': 'ffl ffl ffl ffl ffl',
    'ffi': 'ffi ffi ffi ffi ffi',
    'ff': 'ff ff',
    'fi': 'fi fi',
    'fl': 'fl fl',
    'ij': 'ij ij',
    'lj': 'lj lj',
    'nj': 'nj nj',
    'dz': 'dz dz',
    'ii': 'ii ⅱ',
    'iv': 'iv ⅳ',
    'vi': 'vi ⅵ',
    'ix': 'ix ⅸ',
    'xi': 'xi ⅺ',
}
# Parser matching any key of ligature_map; its parse action replaces the
# matched key with one of that key's whitespace-separated alternatives,
# chosen at random.
ligature_transformer = pp.oneOf(ligature_map)
ligature_transformer.add_parse_action(
    lambda t: random.choice(ligature_map[t[0]].split())
)
def make_mixed_font(t):
    """Parse action: rewrite an identifier using random Unicode look-alikes.

    The first character is handled on its own: a leading underscore is always
    kept as a plain "_" (presumably because its look-alikes are not valid at
    the start of an identifier - TODO confirm), any other first character is
    replaced by a random entry from ident_char_map.  The remaining characters
    are first passed through ligature_transformer and then replaced one by
    one in the same way.  Characters with no entry in ident_char_map are kept
    unchanged.

    Args:
        t: parse results whose first token is the matched identifier.

    Returns:
        The rewritten identifier as a single string.
    """
    name = t[0]
    lead, tail = name[0], name[1:]

    if lead == '_':
        pieces = ['_']
    else:
        pieces = [random.choice(ident_char_map.get(lead, lead))]

    # ligatures are substituted before the per-character pass
    tail = ligature_transformer.transform_string(tail)
    for c in tail:
        pieces.append(random.choice(ident_char_map.get(c, c)))

    return ''.join(pieces)
# Identifier expression whose matches are rewritten by make_mixed_font.
# Work on a private copy: pyparsing_common.identifier is a shared object,
# and attaching the parse action to it directly would also rewrite
# identifiers for every other parser in the process that uses it.
identifier = pp.pyparsing_common.identifier.copy()
identifier.add_parse_action(make_mixed_font)
# A Python string literal with an optional f/F prefix.  The prefix, when
# present, is available as "f_string_prefix"; the literal itself (quotes
# included) as "quoted_string_body".
#
# The triple-quoted forms must be tried first: MatchFirst takes the first
# alternative that succeeds, and the single-line quotedString expression can
# succeed on just the opening of a triple-quoted string (for instance a
# docstring containing an embedded quote), which would leave the rest of
# that string exposed to identifier rewriting.
python_quoted_string = pp.Opt(pp.Char("fF")("f_string_prefix")) + (
    pp.QuotedString('"""', multiline=True, unquoteResults=False)
    | pp.QuotedString("'''", multiline=True, unquoteResults=False)
    | pp.quotedString
)("quoted_string_body")
def mix_fstring_expressions(t):
    """Parse action for python_quoted_string: rewrite f-string placeholders.

    Strings without an f/F prefix are left alone (None is returned, so the
    matched tokens are kept unchanged).  For an f-string, every ``{...}``
    placeholder in the string body is run back through ``transformer`` so
    that the identifiers inside it are rewritten; text outside placeholders,
    including the escaped braces ``{{`` and ``}}``, is kept as-is.

    Args:
        t: parse results carrying the optional ``f_string_prefix`` and the
            ``quoted_string_body`` results names.

    Returns:
        The prefix followed by the rewritten string body, or None when the
        string is not an f-string.
    """
    if not t.f_string_prefix:
        return None

    # A placeholder is everything between "{" and the next "}"; its content
    # is fed back through the module-level transformer.
    # NOTE(review): conversion flags ("!r") and format specs (":>10d") inside
    # a placeholder go through the transformer too - confirm whether they
    # need to be protected from rewriting.
    fstring_arg = pp.QuotedString("{", end_quote_char="}")
    fstring_arg.add_parse_action(
        lambda tt: "{" + transformer.transform_string(tt[0]) + "}"
    )

    # "{{" and "}}" are literal braces in an f-string, not placeholder
    # delimiters.  Matching them first (with no parse action) copies them
    # through untouched and keeps the literal text between them from being
    # mistaken for an expression.
    escaped_brace = pp.Literal("{{") | pp.Literal("}}")
    fstring_body = escaped_brace | fstring_arg

    return t.f_string_prefix + fstring_body.transform_string(t.quoted_string_body)
# Placeholders inside f-strings are rewritten whenever a string is matched.
python_quoted_string.add_parse_action(mix_fstring_expressions)

# Matches any Python keyword; soft keywords are included when the running
# Python version defines keyword.softkwlist.
any_keyword = pp.MatchFirst(
    [
        pp.Keyword(kw)
        for kw in list(keyword.kwlist) + getattr(keyword, "softkwlist", [])
    ]
)

# quoted strings and keywords are matched only so that they are skipped
# over; identifiers are the only expression that rewrites its match
transformer = python_quoted_string | any_keyword | identifier
def demo(show_stdlib_sample=False):
    """Denormalize a small sample program, print it, and run the result.

    The sample is passed through ``transformer``, printed, then compiled and
    executed to show that the rewritten source still runs.

    Args:
        show_stdlib_sample: when true, additionally transform and print the
            source of ``unittest.util`` (that output is printed only, it is
            not executed).  Defaults to False.
    """
    import textwrap

    hello_source = textwrap.dedent("""
    def hello():
        try:
            hello_ = "Hello"
            world_ = "World"
            print(f"{hello_}, {world_}!")
        except TypeError as exc:
            print("failed: {}".format(exc))
    if __name__ == "__main__":
        hello()
    """)

    transformed = transformer.transform_string(hello_source)
    print(transformed)

    # does it really work?
    # The second argument of compile() is the file name reported in
    # tracebacks, so give it a short label rather than the source text.
    code = compile(transformed, "<denormalized hello_source>", mode="exec")
    exec(code)

    if show_stdlib_sample:
        # pick some code from the stdlib
        import inspect
        import unittest.util as lib_module

        source = inspect.getsource(lib_module)
        transformed = transformer.transform_string(source)
        print()
        print(transformed)
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    demo()
|