1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
|
#!/usr/bin/env python
import sys
sys.path.append(".")
from ZhConversion import *
from valid_hanzi import *
def convert(s, d, n):
    """Rewrite ``s`` by greedy longest-match substitution through ``d``.

    Scanning left to right, the longest slice of at most ``n`` characters
    that has a non-empty replacement in ``d`` is replaced by it.  When no
    slice matches, a single character that is missing from ``d`` is copied
    through unchanged.

    Args:
        s: Text to convert.
        d: Mapping from source substrings to their replacements.
        n: Longest key length to try.  Values below 1 are treated as 1 so
            that every step consumes at least one character; without this
            the inner loop would never run and its variables would be
            unbound.

    Returns:
        The converted text as a unicode string.
    """
    pieces = []
    end = len(s)
    begin = 0
    while begin < end:
        # Try the widest window first so longer phrases win over prefixes.
        for width in range(max(1, min(n, end - begin)), 0, -1):
            chunk = s[begin:begin + width]
            # Only a single character may fall back to itself; a longer
            # miss yields None and the window shrinks.
            replacement = d.get(chunk, chunk if width == 1 else None)
            if replacement:
                break
        pieces.append(replacement)
        begin += width
    # Join once at the end instead of growing a string on every step.
    return u"".join(pieces)
def filter_more(records, n):
    """Drop records that the short records can already reproduce.

    Args:
        records: Sequence of ``(key, value)`` string pairs.  It is walked
            twice, so a one-shot iterator will not work.
        n: Key-length threshold for this pass.

    Returns:
        A list holding first the records whose value differs from what
        ``convert`` builds out of the short entries, followed by every
        record whose key is at most ``n`` characters long.
    """
    # Comprehensions replace ``filter(lambda (k, v): ...)``: tuple
    # parameters are not valid in newer Python versions, and the result
    # must be a real list for the concatenation below.
    short = [pair for pair in records if len(pair[0]) <= n]
    lookup = dict(short)
    irreducible = [
        pair for pair in records
        if convert(pair[0], lookup, n) != pair[1]
    ]
    return irreducible + short
def filter_func(args):
    """Tell whether a ``(key, value)`` pair should be kept.

    ``args`` is a two-item sequence ``(k, v)``.  The pair is accepted only
    when both strings have the same length, that length is at most 6, and
    every character of both strings is found in ``valid_hanzi``.
    """
    k, v = args
    # A stricter per-character check is currently disabled; kept here for
    # reference:
    # for c1, c2 in zip(k, v):
    #     if c1 == c2:
    #         continue
    #     if c2 not in S_2_T.get(c1, []):
    #         return False
    return (
        len(k) == len(v)
        and len(k) <= 6
        and all(c in valid_hanzi for c in k)
        and all(c in valid_hanzi for c in v)
    )
def get_records():
    """Build the pruned, sorted conversion table from ``zh2Hant``.

    Returns:
        A ``(maxlen, records)`` tuple.  ``maxlen`` is the longest key
        length among the pairs accepted by ``filter_func``, measured
        before any pruning.  ``records`` is a sorted list of
        ``(key, value)`` pairs, both encoded as UTF-8.

    Raises:
        ValueError: If no pair passes ``filter_func`` (``max`` of an empty
            sequence).
    """
    # Comprehensions instead of ``filter``/``map`` with tuple-parameter
    # lambdas: that lambda form is not valid in newer Python versions.
    records = [pair for pair in zh2Hant.items() if filter_func(pair)]
    maxlen = max(len(k) for k, _ in records)
    # NOTE(review): the last pass runs with n = maxlen - 2; confirm that
    # skipping n = maxlen - 1 is intended.
    for n in range(1, maxlen - 1):
        records = filter_more(records, n)
    records = sorted(
        (k.encode("utf8"), v.encode("utf8")) for k, v in records
    )
    return maxlen, records
def main():
    """Print the conversion table to stdout as a C array and length macro."""
    # Build the table before printing anything, so a failure while
    # computing it cannot leave a half-written array on stdout.
    maxlen, records = get_records()
    print("static const char *simp_to_trad[][2] = {")
    for s, ts in records:
        print(' { "%s", "%s" },' % (s, ts))
    print("};")
    print('#define SIMP_TO_TRAD_MAX_LEN (%d)' % maxlen)
# Generate the table only when run as a script, not when imported.
if __name__ == "__main__":
    main()
|