1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
|
# -*- coding: utf-8 -*-
__all__ = ['slugify']
import re
import unicodedata
import types
import sys
from htmlentitydefs import name2codepoint
from unidecode import unidecode
# character entity reference
CHAR_ENTITY_REXP = re.compile('&(%s);' % '|'.join(name2codepoint))
# decimal character reference
DECIMAL_REXP = re.compile('&#(\d+);')
# hexadecimal character reference
HEX_REXP = re.compile('&#x([\da-fA-F]+);')
REPLACE1_REXP = re.compile(r'[\']+')
REPLACE2_REXP = re.compile(r'[^-a-z0-9]+')
REMOVE_REXP = re.compile('-{2,}')
def smart_truncate(string, max_length=0, word_boundaries=False, separator=' '):
""" Truncate a string """
string = string.strip(separator)
if not max_length:
return string
if len(string) < max_length:
return string
if not word_boundaries:
return string[:max_length].strip(separator)
if separator not in string:
return string[:max_length]
truncated = ''
for word in string.split(separator):
if word:
next_len = len(truncated) + len(word) + len(separator)
if next_len <= max_length:
truncated += '{0}{1}'.format(word, separator)
if not truncated:
truncated = string[:max_length]
return truncated.strip(separator)
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, separator='-'):
""" Make a slug from the given text """
# text to unicode
if not isinstance(text, types.UnicodeType):
text = unicode(text, 'utf-8', 'ignore')
# decode unicode ( 影師嗎 = Ying Shi Ma)
text = unidecode(text)
# text back to unicode
if not isinstance(text, types.UnicodeType):
text = unicode(text, 'utf-8', 'ignore')
# character entity reference
if entities:
text = CHAR_ENTITY_REXP.sub(lambda m: unichr(name2codepoint[m.group(1)]), text)
# decimal character reference
if decimal:
try:
text = DECIMAL_REXP.sub(lambda m: unichr(int(m.group(1))), text)
except:
pass
# hexadecimal character reference
if hexadecimal:
try:
text = HEX_REXP.sub(lambda m: unichr(int(m.group(1), 16)), text)
except:
pass
# translate
text = unicodedata.normalize('NFKD', text)
if sys.version_info < (3,):
text = text.encode('ascii', 'ignore')
# replace unwanted characters
text = REPLACE1_REXP.sub('', text.lower()) # replace ' with nothing instead with -
text = REPLACE2_REXP.sub('-', text.lower())
# remove redundant -
text = REMOVE_REXP.sub('-', text).strip('-')
# smart truncate if requested
if max_length > 0:
text = smart_truncate(text, max_length, word_boundary, '-')
if separator != '-':
text = text.replace('-', separator)
return text
def main():
if len(sys.argv) < 2:
print "Usage %s TEXT TO SLUGIFY" % sys.argv[0]
return
text = ' '.join(sys.argv[1:])
print slugify(text)
|