summaryrefslogtreecommitdiff
path: root/sphinx/util/stemmer.py
diff options
context:
space:
mode:
Diffstat (limited to 'sphinx/util/stemmer.py')
-rw-r--r--sphinx/util/stemmer.py201
1 files changed, 132 insertions, 69 deletions
diff --git a/sphinx/util/stemmer.py b/sphinx/util/stemmer.py
index 6d011b85c..47fc41e87 100644
--- a/sphinx/util/stemmer.py
+++ b/sphinx/util/stemmer.py
@@ -28,6 +28,7 @@
:license: Public Domain ("can be used free of charge for any purpose").
"""
+
class PorterStemmer(object):
def __init__(self):
@@ -49,7 +50,7 @@ class PorterStemmer(object):
def cons(self, i):
"""cons(i) is TRUE <=> b[i] is a consonant."""
if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
- or self.b[i] == 'o' or self.b[i] == 'u':
+ or self.b[i] == 'o' or self.b[i] == 'u':
return 0
if self.b[i] == 'y':
if i == self.k0:
@@ -120,7 +121,7 @@ class PorterStemmer(object):
snow, box, tray.
"""
if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) \
- or not self.cons(i-2):
+ or not self.cons(i-2):
return 0
ch = self.b[i]
if ch == 'w' or ch == 'x' or ch == 'y':
@@ -130,7 +131,7 @@ class PorterStemmer(object):
def ends(self, s):
"""ends(s) is TRUE <=> k0,...k ends with the string s."""
length = len(s)
- if s[length - 1] != self.b[self.k]: # tiny speed-up
+ if s[length - 1] != self.b[self.k]: # tiny speed-up
return 0
if length > (self.k - self.k0 + 1):
return 0
@@ -184,9 +185,12 @@ class PorterStemmer(object):
self.k = self.k - 1
elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
self.k = self.j
- if self.ends("at"): self.setto("ate")
- elif self.ends("bl"): self.setto("ble")
- elif self.ends("iz"): self.setto("ize")
+ if self.ends("at"):
+ self.setto("ate")
+ elif self.ends("bl"):
+ self.setto("ble")
+ elif self.ends("iz"):
+ self.setto("ize")
elif self.doublec(self.k):
self.k = self.k - 1
ch = self.b[self.k]
@@ -207,100 +211,159 @@ class PorterStemmer(object):
string before the suffix must give m() > 0.
"""
if self.b[self.k - 1] == 'a':
- if self.ends("ational"): self.r("ate")
- elif self.ends("tional"): self.r("tion")
+ if self.ends("ational"):
+ self.r("ate")
+ elif self.ends("tional"):
+ self.r("tion")
elif self.b[self.k - 1] == 'c':
- if self.ends("enci"): self.r("ence")
- elif self.ends("anci"): self.r("ance")
+ if self.ends("enci"):
+ self.r("ence")
+ elif self.ends("anci"):
+ self.r("ance")
elif self.b[self.k - 1] == 'e':
- if self.ends("izer"): self.r("ize")
+ if self.ends("izer"):
+ self.r("ize")
elif self.b[self.k - 1] == 'l':
- if self.ends("bli"): self.r("ble") # --DEPARTURE--
+ if self.ends("bli"):
+ self.r("ble") # --DEPARTURE--
# To match the published algorithm, replace this phrase with
# if self.ends("abli"): self.r("able")
- elif self.ends("alli"): self.r("al")
- elif self.ends("entli"): self.r("ent")
- elif self.ends("eli"): self.r("e")
- elif self.ends("ousli"): self.r("ous")
+ elif self.ends("alli"):
+ self.r("al")
+ elif self.ends("entli"):
+ self.r("ent")
+ elif self.ends("eli"):
+ self.r("e")
+ elif self.ends("ousli"):
+ self.r("ous")
elif self.b[self.k - 1] == 'o':
- if self.ends("ization"): self.r("ize")
- elif self.ends("ation"): self.r("ate")
- elif self.ends("ator"): self.r("ate")
+ if self.ends("ization"):
+ self.r("ize")
+ elif self.ends("ation"):
+ self.r("ate")
+ elif self.ends("ator"):
+ self.r("ate")
elif self.b[self.k - 1] == 's':
- if self.ends("alism"): self.r("al")
- elif self.ends("iveness"): self.r("ive")
- elif self.ends("fulness"): self.r("ful")
- elif self.ends("ousness"): self.r("ous")
+ if self.ends("alism"):
+ self.r("al")
+ elif self.ends("iveness"):
+ self.r("ive")
+ elif self.ends("fulness"):
+ self.r("ful")
+ elif self.ends("ousness"):
+ self.r("ous")
elif self.b[self.k - 1] == 't':
- if self.ends("aliti"): self.r("al")
- elif self.ends("iviti"): self.r("ive")
- elif self.ends("biliti"): self.r("ble")
- elif self.b[self.k - 1] == 'g': # --DEPARTURE--
- if self.ends("logi"): self.r("log")
+ if self.ends("aliti"):
+ self.r("al")
+ elif self.ends("iviti"):
+ self.r("ive")
+ elif self.ends("biliti"):
+ self.r("ble")
+ elif self.b[self.k - 1] == 'g': # --DEPARTURE--
+ if self.ends("logi"):
+ self.r("log")
# To match the published algorithm, delete this phrase
def step3(self):
"""step3() dels with -ic-, -full, -ness etc. similar strategy
to step2."""
if self.b[self.k] == 'e':
- if self.ends("icate"): self.r("ic")
- elif self.ends("ative"): self.r("")
- elif self.ends("alize"): self.r("al")
+ if self.ends("icate"):
+ self.r("ic")
+ elif self.ends("ative"):
+ self.r("")
+ elif self.ends("alize"):
+ self.r("al")
elif self.b[self.k] == 'i':
- if self.ends("iciti"): self.r("ic")
+ if self.ends("iciti"):
+ self.r("ic")
elif self.b[self.k] == 'l':
- if self.ends("ical"): self.r("ic")
- elif self.ends("ful"): self.r("")
+ if self.ends("ical"):
+ self.r("ic")
+ elif self.ends("ful"):
+ self.r("")
elif self.b[self.k] == 's':
- if self.ends("ness"): self.r("")
+ if self.ends("ness"):
+ self.r("")
def step4(self):
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
if self.b[self.k - 1] == 'a':
- if self.ends("al"): pass
- else: return
+ if self.ends("al"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'c':
- if self.ends("ance"): pass
- elif self.ends("ence"): pass
- else: return
+ if self.ends("ance"):
+ pass
+ elif self.ends("ence"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'e':
- if self.ends("er"): pass
- else: return
+ if self.ends("er"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'i':
- if self.ends("ic"): pass
- else: return
+ if self.ends("ic"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'l':
- if self.ends("able"): pass
- elif self.ends("ible"): pass
- else: return
+ if self.ends("able"):
+ pass
+ elif self.ends("ible"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'n':
- if self.ends("ant"): pass
- elif self.ends("ement"): pass
- elif self.ends("ment"): pass
- elif self.ends("ent"): pass
- else: return
+ if self.ends("ant"):
+ pass
+ elif self.ends("ement"):
+ pass
+ elif self.ends("ment"):
+ pass
+ elif self.ends("ent"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'o':
- if self.ends("ion") and (self.b[self.j] == 's'
- or self.b[self.j] == 't'): pass
- elif self.ends("ou"): pass
+ if self.ends("ion") and (self.b[self.j] == 's' or
+ self.b[self.j] == 't'):
+ pass
+ elif self.ends("ou"):
+ pass
# takes care of -ous
- else: return
+ else:
+ return
elif self.b[self.k - 1] == 's':
- if self.ends("ism"): pass
- else: return
+ if self.ends("ism"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 't':
- if self.ends("ate"): pass
- elif self.ends("iti"): pass
- else: return
+ if self.ends("ate"):
+ pass
+ elif self.ends("iti"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'u':
- if self.ends("ous"): pass
- else: return
+ if self.ends("ous"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'v':
- if self.ends("ive"): pass
- else: return
+ if self.ends("ive"):
+ pass
+ else:
+ return
elif self.b[self.k - 1] == 'z':
- if self.ends("ize"): pass
- else: return
+ if self.ends("ize"):
+ pass
+ else:
+ return
else:
return
if self.m() > 1:
@@ -316,7 +379,7 @@ class PorterStemmer(object):
if a > 1 or (a == 1 and not self.cvc(self.k-1)):
self.k = self.k - 1
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
- self.k = self.k -1
+ self.k = self.k - 1
def stem(self, p, i, j):
"""In stem(p,i,j), p is a char pointer, and the string to be stemmed
@@ -332,7 +395,7 @@ class PorterStemmer(object):
self.k = j
self.k0 = i
if self.k <= self.k0 + 1:
- return self.b # --DEPARTURE--
+ return self.b # --DEPARTURE--
# With this line, strings of length 1 or 2 don't go through the
# stemming process, although no mention is made of this in the