diff options
Diffstat (limited to 'slugify/slugify.py')
-rw-r--r-- | slugify/slugify.py | 113 |
1 files changed, 113 insertions, 0 deletions
diff --git a/slugify/slugify.py b/slugify/slugify.py new file mode 100644 index 0000000..ba15274 --- /dev/null +++ b/slugify/slugify.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +__all__ = ['slugify'] + +import re +import unicodedata +import types +import sys +from htmlentitydefs import name2codepoint +from unidecode import unidecode + +# character entity reference +CHAR_ENTITY_REXP = re.compile('&(%s);' % '|'.join(name2codepoint)) + +# decimal character reference +DECIMAL_REXP = re.compile('&#(\d+);') + +# hexadecimal character reference +HEX_REXP = re.compile('&#x([\da-fA-F]+);') + +REPLACE1_REXP = re.compile(r'[\']+') +REPLACE2_REXP = re.compile(r'[^-a-z0-9]+') +REMOVE_REXP = re.compile('-{2,}') + + +def smart_truncate(string, max_length=0, word_boundaries=False, separator=' '): + """ Truncate a string """ + + string = string.strip(separator) + + if not max_length: + return string + + if len(string) < max_length: + return string + + if not word_boundaries: + return string[:max_length].strip(separator) + + if separator not in string: + return string[:max_length] + + truncated = '' + for word in string.split(separator): + if word: + next_len = len(truncated) + len(word) + len(separator) + if next_len <= max_length: + truncated += '{0}{1}'.format(word, separator) + if not truncated: + truncated = string[:max_length] + return truncated.strip(separator) + + +def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, separator='-'): + """ Make a slug from the given text """ + + # text to unicode + if not isinstance(text, types.UnicodeType): + text = unicode(text, 'utf-8', 'ignore') + + # decode unicode ( 影師嗎 = Ying Shi Ma) + text = unidecode(text) + + # text back to unicode + if not isinstance(text, types.UnicodeType): + text = unicode(text, 'utf-8', 'ignore') + + # character entity reference + if entities: + text = CHAR_ENTITY_REXP.sub(lambda m: unichr(name2codepoint[m.group(1)]), text) + + # decimal character reference + if decimal: + try: + text = DECIMAL_REXP.sub(lambda m: unichr(int(m.group(1))), text) + except: + pass + + # hexadecimal character reference + if hexadecimal: + try: + text = HEX_REXP.sub(lambda m: unichr(int(m.group(1), 16)), text) + except: + pass + + # translate + text = unicodedata.normalize('NFKD', text) + if sys.version_info < (3,): + text = text.encode('ascii', 'ignore') + + # replace unwanted characters + text = REPLACE1_REXP.sub('', text.lower()) # replace ' with nothing instead with - + text = REPLACE2_REXP.sub('-', text.lower()) + + # remove redundant - + text = REMOVE_REXP.sub('-', text).strip('-') + + # smart truncate if requested + if max_length > 0: + text = smart_truncate(text, max_length, word_boundary, '-') + + if separator != '-': + text = text.replace('-', separator) + + return text + + +def main(): + if len(sys.argv) < 2: + print "Usage %s TEXT TO SLUGIFY" % sys.argv[0] + return + text = ' '.join(sys.argv[1:]) + print slugify(text) |