summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Val Neekman <un33kvu@gmail.com> 2015-05-06 10:39:39 -0400
committer: Val Neekman <un33kvu@gmail.com> 2015-05-06 10:39:39 -0400
commit: fe2d4b56dfe5a7e2b50e4c2762ff2a88f77257da (patch)
tree: e67d46fe40199d61e5b0c4352c25a0a36fbe2af8
parent: c800617dfd45608df60c9b667381801a1544ff58 (diff)
parent: f60687c611f6ef8262870ef5f50bcd54242b904b (diff)
download: python-slugify-fe2d4b56dfe5a7e2b50e4c2762ff2a88f77257da.tar.gz
Merge pull request #15 from euangoddard/master
Add support for stopwords
-rw-r--r-- slugify/slugify.py 9
-rw-r--r-- test.py 59
2 files changed, 53 insertions, 15 deletions
diff --git a/slugify/slugify.py b/slugify/slugify.py
index 82f67fe..5951eb3 100644
--- a/slugify/slugify.py
+++ b/slugify/slugify.py
@@ -74,7 +74,7 @@ def smart_truncate(string, max_length=0, word_boundaries=False, separator=' ', s
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
- separator='-', save_order=False):
+ separator='-', save_order=False, stopwords=()):
"""Make a slug from the given text.
:param text (str): initial text
:param entities (bool):
@@ -84,6 +84,7 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
:param word_boundary (bool):
:param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
:param separator (str): separator between words
+ :param stopwords (iterable): words to discount
:return (str):
"""
@@ -128,6 +129,12 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
# remove redundant -
text = REMOVE_REXP.sub('-', text).strip('-')
+ # remove stopwords
+ if stopwords:
+ stopwords_lower = [s.lower() for s in stopwords]
+ words = [w for w in text.split(separator) if w not in stopwords_lower]
+ text = separator.join(words)
+
# smart truncate if requested
if max_length > 0:
text = smart_truncate(text, max_length, word_boundary, '-', save_order)
diff --git a/test.py b/test.py
index 88027ee..ee1a04c 100644
--- a/test.py
+++ b/test.py
@@ -4,22 +4,33 @@ import unittest
from slugify import slugify
-class TestSequenceFunctions(unittest.TestCase):
+class TestSlugification(unittest.TestCase):
- def test_manager(self):
+ def test_extraneous_seperators(self):
txt = "This is a test ---"
r = slugify(txt)
self.assertEqual(r, "this-is-a-test")
+ txt = "___This is a test ---"
+ r = slugify(txt)
+ self.assertEqual(r, "this-is-a-test")
+
+ txt = "___This is a test___"
+ r = slugify(txt)
+ self.assertEqual(r, "this-is-a-test")
+
+ def test_non_word_characters(self):
txt = "This -- is a ## test ---"
r = slugify(txt)
self.assertEqual(r, "this-is-a-test")
+ def test_phonetic_conversion_of_eastern_scripts(self):
txt = '影師嗎'
r = slugify(txt)
self.assertEqual(r, "ying-shi-ma")
+ def test_accented_text(self):
txt = 'C\'est déjà l\'été.'
r = slugify(txt)
self.assertEqual(r, "cest-deja-lete")
@@ -28,14 +39,17 @@ class TestSequenceFunctions(unittest.TestCase):
r = slugify(txt)
self.assertEqual(r, "nin-hao-wo-shi-zhong-guo-ren")
- txt = 'Компьютер'
- r = slugify(txt)
- self.assertEqual(r, "kompiuter")
-
+ def test_accented_text_with_non_word_characters(self):
txt = 'jaja---lol-méméméoo--a'
r = slugify(txt)
self.assertEqual(r, "jaja-lol-mememeoo-a")
+ def test_cyrillic_text(self):
+ txt = 'Компьютер'
+ r = slugify(txt)
+ self.assertEqual(r, "kompiuter")
+
+ def test_max_length(self):
txt = 'jaja---lol-méméméoo--a'
r = slugify(txt, max_length=9)
self.assertEqual(r, "jaja-lol")
@@ -44,10 +58,12 @@ class TestSequenceFunctions(unittest.TestCase):
r = slugify(txt, max_length=15)
self.assertEqual(r, "jaja-lol-mememe")
+ def test_max_length_cutoff_not_required(self):
txt = 'jaja---lol-méméméoo--a'
r = slugify(txt, max_length=50)
self.assertEqual(r, "jaja-lol-mememeoo-a")
+ def test_word_boundary(self):
txt = 'jaja---lol-méméméoo--a'
r = slugify(txt, max_length=15, word_boundary=True)
self.assertEqual(r, "jaja-lol-a")
@@ -64,22 +80,17 @@ class TestSequenceFunctions(unittest.TestCase):
r = slugify(txt, max_length=19, word_boundary=True)
self.assertEqual(r, "jaja-lol-mememeoo-a")
+ def test_custom_separator(self):
txt = 'jaja---lol-méméméoo--a'
r = slugify(txt, max_length=20, word_boundary=True, separator=".")
self.assertEqual(r, "jaja.lol.mememeoo.a")
+ def test_multi_character_separator(self):
txt = 'jaja---lol-méméméoo--a'
r = slugify(txt, max_length=20, word_boundary=True, separator="ZZZZZZ")
self.assertEqual(r, "jajaZZZZZZlolZZZZZZmememeooZZZZZZa")
- txt = "___This is a test ---"
- r = slugify(txt)
- self.assertEqual(r, "this-is-a-test")
-
- txt = "___This is a test___"
- r = slugify(txt)
- self.assertEqual(r, "this-is-a-test")
-
+ def test_save_order(self):
txt = 'one two three four five'
r = slugify(txt, max_length=13, word_boundary=True, save_order=True)
self.assertEqual(r, "one-two-three")
@@ -96,5 +107,25 @@ class TestSequenceFunctions(unittest.TestCase):
r = slugify(txt, max_length=12, word_boundary=True, save_order=True)
self.assertEqual(r, "one-two")
+ def test_stopword_removal(self):
+ txt = 'this has a stopword'
+ r = slugify(txt, stopwords=['stopword'])
+ self.assertEqual(r, 'this-has-a')
+
+ def test_multiple_stopword_occurances(self):
+ txt = 'the quick brown fox jumps over the lazy dog'
+ r = slugify(txt, stopwords=['the'])
+ self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog')
+
+ def test_differently_cased_stopword_match(self):
+ txt = 'Foo A FOO B foo C'
+ r = slugify(txt, stopwords=['foo'])
+ self.assertEqual(r, 'a-b-c')
+
+ txt = 'Foo A FOO B foo C'
+ r = slugify(txt, stopwords=['FOO'])
+ self.assertEqual(r, 'a-b-c')
+
+
if __name__ == '__main__':
unittest.main()