diff options
Diffstat (limited to 'slugify/__init__.py')
-rw-r--r-- | slugify/__init__.py | 113 |
1 files changed, 2 insertions, 111 deletions
diff --git a/slugify/__init__.py b/slugify/__init__.py index 0ff57f0..16dde8d 100644 --- a/slugify/__init__.py +++ b/slugify/__init__.py @@ -1,114 +1,5 @@ # -*- coding: utf-8 -*- -__version__ = '0.0.7' +__version__ = '0.0.8' -__all__ = ['slugify'] - -import re -import unicodedata -import types -import sys -from htmlentitydefs import name2codepoint -from unidecode import unidecode - -# character entity reference -CHAR_ENTITY_REXP = re.compile('&(%s);' % '|'.join(name2codepoint)) - -# decimal character reference -DECIMAL_REXP = re.compile('&#(\d+);') - -# hexadecimal character reference -HEX_REXP = re.compile('&#x([\da-fA-F]+);') - -REPLACE1_REXP = re.compile(r'[\']+') -REPLACE2_REXP = re.compile(r'[^-a-z0-9]+') -REMOVE_REXP = re.compile('-{2,}') - -def smart_truncate(string, max_length=0, word_boundaries=False, separator=' '): - """ Truncate a string """ - - string = string.strip(separator) - - if not max_length: - return string - - if len(string) < max_length: - return string - - if not word_boundaries: - return string[:max_length].strip(separator) - - if separator not in string: - return string[:max_length] - - truncated = '' - for word in string.split(separator): - if word: - next_len = len(truncated) + len(word) + len(separator) - if next_len <= max_length: - truncated += '{0}{1}'.format(word, separator) - if not truncated: - truncated = string[:max_length] - return truncated.strip(separator) - - -def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, separator='-'): - """ Make a slug from the given text """ - - # text to unicode - if type(text) != types.UnicodeType: - text = unicode(text, 'utf-8', 'ignore') - - # decode unicode ( 影師嗎 = Ying Shi Ma) - text = unidecode(text) - - # text back to unicode - if type(text) != types.UnicodeType: - text = unicode(text, 'utf-8', 'ignore') - - # character entity reference - if entities: - text = CHAR_ENTITY_REXP.sub(lambda m: unichr(name2codepoint[m.group(1)]), text) - - # decimal character reference - if decimal: - try: - text = DECIMAL_REXP.sub(lambda m: unichr(int(m.group(1))), text) - except: - pass - - # hexadecimal character reference - if hexadecimal: - try: - text = HEX_REXP.sub(lambda m: unichr(int(m.group(1), 16)), text) - except: - pass - - # translate - text = unicodedata.normalize('NFKD', text) - if sys.version_info < (3,): - text = text.encode('ascii', 'ignore') - - # replace unwanted characters - text = REPLACE1_REXP.sub('', text.lower()) # replace ' with nothing instead with - - text = REPLACE2_REXP.sub('-', text.lower()) - - # remove redundant - - text = REMOVE_REXP.sub('-', text).strip('-') - - # smart truncate if requested - if max_length > 0: - text = smart_truncate(text, max_length, word_boundary, '-') - - if separator != '-': - text = text.replace('-', separator) - - return text - - -def main(): - if len(sys.argv) < 2: - print "Usage %s TEXT TO SLUGIFY" % sys.argv[0] - return - text = ' '.join(sys.argv[1:]) - print slugify(text) +from slugify import * |