00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 #include <string>
00016 #include <cstring>
00017 #include <set>
00018 
00019 namespace lemur
00020 {
00021   namespace parse
00022   {
00023     class Arabic_Stemmer 
00024     {
00025     public:
00026       Arabic_Stemmer(std::string stemFunc);
00027       ~Arabic_Stemmer();
00028       
00029       void stemTerm(char *, char *);
00030 
00031     private:
00032       void (Arabic_Stemmer::*stem_fct)(char *, char *) ;
00033       void arabic_remove_diacritics (char *, char *);
00034       
00035       void arabic_stop(char *, char *);  
00036       void no_stem(char *, char *) ;       
00037       
00038       void arabic_norm2(char *, char *);
00039       void arabic_norm2_stop(char *, char *);
00040       void arabic_light10(char *, char *);   
00041       void arabic_light10_stop(char *, char *);         
00042       
00043       struct ltstr {
00044         bool operator()(const char* s1, const char* s2) const {
00045           return strcmp(s1, s2) < 0;
00046         }
00047       };
00048       std::set<const char *, ltstr> stop_words_ht;
00049       bool on_stop_list (char *word);
00050     
00051       typedef struct {
00052         const char *option;
00053         void (Arabic_Stemmer::*stem_fct)(char *, char *) ;
00054       } stem_info_t;
00055       static stem_info_t stemtable[];
00056       static const int ArabicVowel[256];
00057       static const int Norm3Char[256];
00058       static const int NormChar[256];
00059       static const int isWhitespace[256];
00060       static const char *stopwords[];
00061       static const char *suffixes[];
00062       static const char *defarticles[];
00063       int is_whitespace (const char c);
00064       void remove_definite_articles(char *word, char *result);
00065       void remove_all_suffixes(char *word, char *result, size_t lenlimit);
00066     };
00067   }
00068 }
00069 
00070