summaryrefslogtreecommitdiff
path: root/slugify/slugify.py
diff options
context:
space:
mode:
Diffstat (limited to 'slugify/slugify.py')
-rw-r--r--slugify/slugify.py113
1 files changed, 113 insertions, 0 deletions
diff --git a/slugify/slugify.py b/slugify/slugify.py
new file mode 100644
index 0000000..ba15274
--- /dev/null
+++ b/slugify/slugify.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+
+__all__ = ['slugify']
+
+import re
+import unicodedata
+import types
+import sys
+from htmlentitydefs import name2codepoint
+from unidecode import unidecode
+
# All patterns are raw strings so that regex escapes such as \d are never
# interpreted (or warned about) as Python string escapes.

# character entity reference, e.g. "&amp;" (group 1 is the entity name)
CHAR_ENTITY_REXP = re.compile(r'&(%s);' % '|'.join(name2codepoint))

# decimal character reference, e.g. "&#233;" (group 1 is the decimal digits)
DECIMAL_REXP = re.compile(r'&#(\d+);')

# hexadecimal character reference, e.g. "&#xe9;" (group 1 is the hex digits)
HEX_REXP = re.compile(r'&#x([\da-fA-F]+);')

# one or more consecutive apostrophes
REPLACE1_REXP = re.compile(r'[\']+')
# any run of characters other than "-", lowercase ASCII letters and digits
REPLACE2_REXP = re.compile(r'[^-a-z0-9]+')
# two or more consecutive hyphens
REMOVE_REXP = re.compile(r'-{2,}')
+
+
def smart_truncate(string, max_length=0, word_boundaries=False, separator=' '):
    """Truncate ``string`` to at most ``max_length`` characters.

    Leading and trailing ``separator`` characters are always stripped, even
    when no truncation takes place.

    :param string: text to truncate.
    :param max_length: maximum length of the result; a falsy or negative
        value disables truncation.
    :param word_boundaries: when true, keep only whole words (the non-empty
        pieces between ``separator`` occurrences) instead of cutting in the
        middle of a word.
    :param separator: string that delimits words.
    :return: the truncated string without leading/trailing separators.

    NOTE: with ``word_boundaries`` a word that does not fit is skipped, and a
    later, shorter word may still be kept. If no word fits at all, the string
    is cut at ``max_length`` characters.
    """
    string = string.strip(separator)

    # No limit requested, or the text already fits (an exact fit included).
    if not max_length or max_length < 0 or len(string) <= max_length:
        return string

    if not word_boundaries:
        return string[:max_length].strip(separator)

    if separator not in string:
        return string[:max_length]

    kept = []
    kept_len = 0
    for word in string.split(separator):
        if not word:
            continue
        # A separator is only needed *between* words, never after the last
        # one, so the first kept word costs just its own length.
        needed = len(word) + (len(separator) if kept else 0)
        if kept_len + needed <= max_length:
            kept.append(word)
            kept_len += needed

    if not kept:
        # Not even a single word fits: fall back to a hard cut.
        return string[:max_length].strip(separator)
    return separator.join(kept).strip(separator)
+
+
def _unichr_or_keep(codepoint, original):
    """Return the character for ``codepoint``, or ``original`` if it is invalid."""
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        # Code point outside the range unichr() supports: leave the
        # reference text as it was instead of failing.
        return original


def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, separator='-'):
    """Make a slug from the given text.

    :param text: text to slugify; anything that is not already unicode is
        decoded as UTF-8, ignoring undecodable bytes.
    :param entities: replace named character references (``&amp;``) with the
        character they stand for.
    :param decimal: replace decimal character references (``&#233;``).
    :param hexadecimal: replace hexadecimal character references (``&#xe9;``).
    :param max_length: if greater than 0, truncate the slug to this many
        characters. The limit is applied while the slug still uses ``-``,
        i.e. before ``separator`` is substituted.
    :param word_boundary: when truncating, keep only whole words.
    :param separator: string placed between words in the result.
    :return: the slug: lowercase ASCII letters and digits joined by
        ``separator``. On Python 2 this is an ASCII-encoded byte string.
    """

    # text to unicode
    if not isinstance(text, types.UnicodeType):
        text = unicode(text, 'utf-8', 'ignore')

    # decode unicode ( 影師嗎 = Ying Shi Ma)
    text = unidecode(text)

    # text back to unicode
    if not isinstance(text, types.UnicodeType):
        text = unicode(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_REXP.sub(lambda m: unichr(name2codepoint[m.group(1)]), text)

    # decimal character reference; an out-of-range reference is kept as-is
    # without preventing the other references from being replaced
    if decimal:
        text = DECIMAL_REXP.sub(lambda m: _unichr_or_keep(int(m.group(1)), m.group(0)), text)

    # hexadecimal character reference; same per-reference fallback
    if hexadecimal:
        text = HEX_REXP.sub(lambda m: _unichr_or_keep(int(m.group(1), 16), m.group(0)), text)

    # translate: decompose accented characters, then drop whatever is not ASCII
    text = unicodedata.normalize('NFKD', text)
    if sys.version_info < (3,):
        text = text.encode('ascii', 'ignore')

    # replace unwanted characters
    text = text.lower()
    text = REPLACE1_REXP.sub('', text)  # drop apostrophes rather than turning them into '-'
    text = REPLACE2_REXP.sub('-', text)

    # remove redundant -
    text = REMOVE_REXP.sub('-', text).strip('-')

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, '-')

    if separator != '-':
        text = text.replace('-', separator)

    return text
+
+
def main():
    """Command-line entry point: slugify the arguments and print the result.

    All command-line arguments are joined with single spaces and passed to
    ``slugify`` with its default options. When no argument is given, a usage
    line is printed instead.
    """
    if len(sys.argv) < 2:
        # The single-argument parenthesised form behaves the same as the
        # Python 2 print statement and is also valid as the Python 3 function.
        print("Usage %s TEXT TO SLUGIFY" % sys.argv[0])
        return
    text = ' '.join(sys.argv[1:])
    print(slugify(text))