00001 /*========================================================================== 00002 * Copyright (c) 2005 University of Massachusetts. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 /* This is the Porter stemming algorithm, coded up in ANSI C by the 00014 author. It may be be regarded as cononical, in that it follows the 00015 algorithm presented in 00016 00017 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 00018 no. 3, pp 130-137, 00019 00020 only differing from it at the points maked --DEPARTURE-- below. 00021 00022 See also http://www.omsee.com/~martin/stem.html 00023 00024 The algorithm as described in the paper could be exactly replicated 00025 by adjusting the points of DEPARTURE, but this is barely necessary, 00026 because (a) the points of DEPARTURE are definitely improvements, and 00027 (b) no encoding of the Porter stemmer I have seen is anything like 00028 as exact as this version, even with the points of DEPARTURE! 00029 00030 You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which 00031 'stem' takes a list of inputs and sends the stemmed equivalent to 00032 stdout. 00033 00034 The algorithm as encoded here is particularly fast. 00035 00036 Release 1 00037 00038 07/31/2005 -- Rewrite to be a standalone object. 00039 */ 00040 00041 00042 /* The main part of the stemming algorithm starts here. b is a buffer 00043 holding a word to be stemmed. The letters are in b[k0], b[k0+1] ... 00044 ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted 00045 downwards as the stemming progresses. Zero termination is not in fact 00046 used in the algorithm. 00047 00048 Note that only lower case sequences are stemmed. Forcing to lower case 00049 should be done before stem(...) is called. 00050 */ 00051 #ifndef _PORTER_STEMMER_H_ 00052 #define _PORTER_STEMMER_H_ 00053 #include "indri/Mutex.hpp" 00054 #include "indri/ScopedLock.hpp" 00055 00056 namespace indri 00057 { 00058 namespace parse 00059 { 00060 class Porter_Stemmer 00061 { 00062 private: 00063 indri::thread::Mutex _stemLock; 00064 char * b; /* buffer for word to be stemmed */ 00065 int k,k0,j; /* j is a general offset into the string */ 00066 00067 /* cons(i) is TRUE <=> b[i] is a consonant. */ 00068 bool cons(int i); 00069 00070 /* m() measures the number of consonant sequences between k0 and j. if c is 00071 a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 00072 presence, 00073 00074 <c><v> gives 0 00075 <c>vc<v> gives 1 00076 <c>vcvc<v> gives 2 00077 <c>vcvcvc<v> gives 3 00078 .... 00079 */ 00080 00081 int m(); 00082 00083 /* vowelinstem() is TRUE <=> k0,...j contains a vowel */ 00084 00085 bool vowelinstem(); 00086 00087 /* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */ 00088 00089 bool doublec(int j); 00090 00091 /* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant 00092 and also if the second c is not w,x or y. this is used when trying to 00093 restore an e at the end of a short word. e.g. 00094 00095 cav(e), lov(e), hop(e), crim(e), but 00096 snow, box, tray. 00097 00098 */ 00099 00100 bool cvc(int i); 00101 00102 /* ends(s) is TRUE <=> k0,...k ends with the string s. */ 00103 00104 bool ends(const char * s); 00105 00106 /* setto(s) sets (j+1),...k to the characters in the string s, readjusting 00107 k. */ 00108 00109 void setto(const char * s); 00110 00111 /* r(s) is used further down. */ 00112 00113 void r(const char * s); 00114 00115 /* step1ab() gets rid of plurals and -ed or -ing. e.g. 00116 00117 caresses -> caress 00118 ponies -> poni 00119 ties -> ti 00120 caress -> caress 00121 cats -> cat 00122 00123 feed -> feed 00124 agreed -> agree 00125 disabled -> disable 00126 00127 matting -> mat 00128 mating -> mate 00129 meeting -> meet 00130 milling -> mill 00131 messing -> mess 00132 00133 meetings -> meet 00134 00135 */ 00136 00137 void step1ab(); 00138 00139 /* step1c() turns terminal y to i when there is another vowel in the stem. */ 00140 00141 void step1c(); 00142 00143 00144 /* step2() maps double suffices to single ones. so -ization ( = -ize plus 00145 -ation) maps to -ize etc. note that the string before the suffix must give 00146 m() > 0. */ 00147 00148 void step2(); 00149 00150 /* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */ 00151 00152 void step3(); 00153 00154 /* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */ 00155 00156 void step4(); 00157 00158 /* step5() removes a final -e if m() > 1, and changes -ll to -l if 00159 m() > 1. */ 00160 00161 void step5(); 00162 public: 00163 /* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from 00164 p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last 00165 character of a string, (p[j+1] == '\0'). The stemmer adjusts the 00166 characters p[i] ... p[j] and returns the new end-point of the string, k. 00167 Stemming never increases word length, so i <= k <= j. To turn the stemmer 00168 into a module, declare 'stem' as extern, and delete the remainder of this 00169 file. 00170 */ 00181 int porter_stem(char * p, int i, int j); 00182 }; 00183 } 00184 } 00185 #endif /* _PORTER_STEMMER_H_*/