00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include <string>
00016 #include <cstring>
00017 #include <set>
00018
namespace lemur
{
  namespace parse
  {
    /// Stemmer for Arabic terms.
    ///
    /// Only the declaration lives here; every member function except
    /// ltstr::operator() is defined out of line. The stemming routine that
    /// stemTerm() uses is held in the member-function pointer stem_fct.
    /// NOTE(review): presumably the constructor picks that routine by matching
    /// its argument against the `option` strings in stemtable -- confirm
    /// against the implementation file.
    class Arabic_Stemmer
    {
    public:
      /// Construct a stemmer.
      /// @param stemFunc name of the stemming function to use (presumably one
      ///        of the `option` entries of stemtable -- TODO confirm).
      /// Deliberately left non-explicit so existing implicit conversions from
      /// std::string keep compiling.
      Arabic_Stemmer(std::string stemFunc);
      ~Arabic_Stemmer();

      /// Stem a single term.
      /// The two parameters are unnamed in this declaration; judging by the
      /// (word, result) naming of the private helpers below they are
      /// presumably the input word and a caller-supplied output buffer --
      /// verify against the definition, including the required buffer size.
      void stemTerm(char *, char *);

    private:
      /// Currently selected stemming routine; all candidates share the
      /// (char *, char *) member-function signature declared below.
      void (Arabic_Stemmer::*stem_fct)(char *, char *);

      // Candidate routines and helpers sharing the stem_fct signature.
      // Their behaviour is defined in the implementation file.
      void arabic_remove_diacritics(char *, char *);

      void arabic_stop(char *, char *);
      void no_stem(char *, char *);

      void arabic_norm2(char *, char *);
      void arabic_norm2_stop(char *, char *);
      void arabic_light10(char *, char *);
      void arabic_light10_stop(char *, char *);

      /// Orders C strings by content rather than by pointer value.
      /// Both arguments must be non-null, NUL-terminated strings.
      struct ltstr {
        bool operator()(const char *s1, const char *s2) const {
          // <cstring> guarantees std::strcmp; the global-namespace name is
          // not guaranteed to be declared by that header.
          return std::strcmp(s1, s2) < 0;
        }
      };

      /// Set of C-string pointers compared by content (see ltstr). Only the
      /// pointers are stored, so the pointed-to strings must outlive the set.
      /// NOTE(review): presumably filled from `stopwords` -- confirm.
      std::set<const char *, ltstr> stop_words_ht;

      /// Stop-list membership test for `word` (defined out of line;
      /// presumably a lookup in stop_words_ht).
      bool on_stop_list(char *word);

      /// Pairs an option string with the member function implementing it.
      struct stem_info_t {
        const char *option;
        void (Arabic_Stemmer::*stem_fct)(char *, char *);
      };

      /// Table of available stemming options (defined out of line).
      static stem_info_t stemtable[];

      // 256-entry lookup tables, presumably indexed by byte value -- the
      // contents and exact meaning are defined in the implementation file.
      static const int ArabicVowel[256];
      static const int Norm3Char[256];
      static const int NormChar[256];
      static const int isWhitespace[256];

      // String tables defined out of line; the element count and any
      // terminator convention are not visible from this declaration.
      static const char *stopwords[];
      static const char *suffixes[];
      static const char *defarticles[];

      /// Whitespace test for a single character (presumably backed by the
      /// isWhitespace table -- confirm).
      int is_whitespace(const char c);

      /// Helper taking an input `word` and an output `result` buffer
      /// (defined out of line).
      void remove_definite_articles(char *word, char *result);

      /// Helper taking an input `word`, an output `result` buffer and a
      /// length limit `lenlimit` (defined out of line; the exact meaning of
      /// the limit should be checked against the definition).
      void remove_all_suffixes(char *word, char *result, std::size_t lenlimit);
    };
  }
}
00069
00070