diff options
Diffstat (limited to 'sphinx/util/stemmer.py')
-rw-r--r-- | sphinx/util/stemmer.py | 201 |
1 files changed, 132 insertions, 69 deletions
diff --git a/sphinx/util/stemmer.py b/sphinx/util/stemmer.py index 6d011b85c..47fc41e87 100644 --- a/sphinx/util/stemmer.py +++ b/sphinx/util/stemmer.py @@ -28,6 +28,7 @@ :license: Public Domain ("can be used free of charge for any purpose"). """ + class PorterStemmer(object): def __init__(self): @@ -49,7 +50,7 @@ class PorterStemmer(object): def cons(self, i): """cons(i) is TRUE <=> b[i] is a consonant.""" if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \ - or self.b[i] == 'o' or self.b[i] == 'u': + or self.b[i] == 'o' or self.b[i] == 'u': return 0 if self.b[i] == 'y': if i == self.k0: @@ -120,7 +121,7 @@ class PorterStemmer(object): snow, box, tray. """ if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) \ - or not self.cons(i-2): + or not self.cons(i-2): return 0 ch = self.b[i] if ch == 'w' or ch == 'x' or ch == 'y': @@ -130,7 +131,7 @@ class PorterStemmer(object): def ends(self, s): """ends(s) is TRUE <=> k0,...k ends with the string s.""" length = len(s) - if s[length - 1] != self.b[self.k]: # tiny speed-up + if s[length - 1] != self.b[self.k]: # tiny speed-up return 0 if length > (self.k - self.k0 + 1): return 0 @@ -184,9 +185,12 @@ class PorterStemmer(object): self.k = self.k - 1 elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): self.k = self.j - if self.ends("at"): self.setto("ate") - elif self.ends("bl"): self.setto("ble") - elif self.ends("iz"): self.setto("ize") + if self.ends("at"): + self.setto("ate") + elif self.ends("bl"): + self.setto("ble") + elif self.ends("iz"): + self.setto("ize") elif self.doublec(self.k): self.k = self.k - 1 ch = self.b[self.k] @@ -207,100 +211,159 @@ class PorterStemmer(object): string before the suffix must give m() > 0. """ if self.b[self.k - 1] == 'a': - if self.ends("ational"): self.r("ate") - elif self.ends("tional"): self.r("tion") + if self.ends("ational"): + self.r("ate") + elif self.ends("tional"): + self.r("tion") elif self.b[self.k - 1] == 'c': - if self.ends("enci"): self.r("ence") - elif self.ends("anci"): self.r("ance") + if self.ends("enci"): + self.r("ence") + elif self.ends("anci"): + self.r("ance") elif self.b[self.k - 1] == 'e': - if self.ends("izer"): self.r("ize") + if self.ends("izer"): + self.r("ize") elif self.b[self.k - 1] == 'l': - if self.ends("bli"): self.r("ble") # --DEPARTURE-- + if self.ends("bli"): + self.r("ble") # --DEPARTURE-- # To match the published algorithm, replace this phrase with # if self.ends("abli"): self.r("able") - elif self.ends("alli"): self.r("al") - elif self.ends("entli"): self.r("ent") - elif self.ends("eli"): self.r("e") - elif self.ends("ousli"): self.r("ous") + elif self.ends("alli"): + self.r("al") + elif self.ends("entli"): + self.r("ent") + elif self.ends("eli"): + self.r("e") + elif self.ends("ousli"): + self.r("ous") elif self.b[self.k - 1] == 'o': - if self.ends("ization"): self.r("ize") - elif self.ends("ation"): self.r("ate") - elif self.ends("ator"): self.r("ate") + if self.ends("ization"): + self.r("ize") + elif self.ends("ation"): + self.r("ate") + elif self.ends("ator"): + self.r("ate") elif self.b[self.k - 1] == 's': - if self.ends("alism"): self.r("al") - elif self.ends("iveness"): self.r("ive") - elif self.ends("fulness"): self.r("ful") - elif self.ends("ousness"): self.r("ous") + if self.ends("alism"): + self.r("al") + elif self.ends("iveness"): + self.r("ive") + elif self.ends("fulness"): + self.r("ful") + elif self.ends("ousness"): + self.r("ous") elif self.b[self.k - 1] == 't': - if self.ends("aliti"): self.r("al") - elif self.ends("iviti"): self.r("ive") - elif self.ends("biliti"): self.r("ble") - elif self.b[self.k - 1] == 'g': # --DEPARTURE-- - if self.ends("logi"): self.r("log") + if self.ends("aliti"): + self.r("al") + elif self.ends("iviti"): + self.r("ive") + elif self.ends("biliti"): + self.r("ble") + elif self.b[self.k - 1] == 'g': # --DEPARTURE-- + if self.ends("logi"): + self.r("log") # To match the published algorithm, delete this phrase def step3(self): """step3() dels with -ic-, -full, -ness etc. similar strategy to step2.""" if self.b[self.k] == 'e': - if self.ends("icate"): self.r("ic") - elif self.ends("ative"): self.r("") - elif self.ends("alize"): self.r("al") + if self.ends("icate"): + self.r("ic") + elif self.ends("ative"): + self.r("") + elif self.ends("alize"): + self.r("al") elif self.b[self.k] == 'i': - if self.ends("iciti"): self.r("ic") + if self.ends("iciti"): + self.r("ic") elif self.b[self.k] == 'l': - if self.ends("ical"): self.r("ic") - elif self.ends("ful"): self.r("") + if self.ends("ical"): + self.r("ic") + elif self.ends("ful"): + self.r("") elif self.b[self.k] == 's': - if self.ends("ness"): self.r("") + if self.ends("ness"): + self.r("") def step4(self): """step4() takes off -ant, -ence etc., in context <c>vcvc<v>.""" if self.b[self.k - 1] == 'a': - if self.ends("al"): pass - else: return + if self.ends("al"): + pass + else: + return elif self.b[self.k - 1] == 'c': - if self.ends("ance"): pass - elif self.ends("ence"): pass - else: return + if self.ends("ance"): + pass + elif self.ends("ence"): + pass + else: + return elif self.b[self.k - 1] == 'e': - if self.ends("er"): pass - else: return + if self.ends("er"): + pass + else: + return elif self.b[self.k - 1] == 'i': - if self.ends("ic"): pass - else: return + if self.ends("ic"): + pass + else: + return elif self.b[self.k - 1] == 'l': - if self.ends("able"): pass - elif self.ends("ible"): pass - else: return + if self.ends("able"): + pass + elif self.ends("ible"): + pass + else: + return elif self.b[self.k - 1] == 'n': - if self.ends("ant"): pass - elif self.ends("ement"): pass - elif self.ends("ment"): pass - elif self.ends("ent"): pass - else: return + if self.ends("ant"): + pass + elif self.ends("ement"): + pass + elif self.ends("ment"): + pass + elif self.ends("ent"): + pass + else: + return elif self.b[self.k - 1] == 'o': - if self.ends("ion") and (self.b[self.j] == 's' - or self.b[self.j] == 't'): pass - elif self.ends("ou"): pass + if self.ends("ion") and (self.b[self.j] == 's' or + self.b[self.j] == 't'): + pass + elif self.ends("ou"): + pass # takes care of -ous - else: return + else: + return elif self.b[self.k - 1] == 's': - if self.ends("ism"): pass - else: return + if self.ends("ism"): + pass + else: + return elif self.b[self.k - 1] == 't': - if self.ends("ate"): pass - elif self.ends("iti"): pass - else: return + if self.ends("ate"): + pass + elif self.ends("iti"): + pass + else: + return elif self.b[self.k - 1] == 'u': - if self.ends("ous"): pass - else: return + if self.ends("ous"): + pass + else: + return elif self.b[self.k - 1] == 'v': - if self.ends("ive"): pass - else: return + if self.ends("ive"): + pass + else: + return elif self.b[self.k - 1] == 'z': - if self.ends("ize"): pass - else: return + if self.ends("ize"): + pass + else: + return else: return if self.m() > 1: @@ -316,7 +379,7 @@ class PorterStemmer(object): if a > 1 or (a == 1 and not self.cvc(self.k-1)): self.k = self.k - 1 if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: - self.k = self.k -1 + self.k = self.k - 1 def stem(self, p, i, j): """In stem(p,i,j), p is a char pointer, and the string to be stemmed @@ -332,7 +395,7 @@ class PorterStemmer(object): self.k = j self.k0 = i if self.k <= self.k0 + 1: - return self.b # --DEPARTURE-- + return self.b # --DEPARTURE-- # With this line, strings of length 1 or 2 don't go through the # stemming process, although no mention is made of this in the |