_IDVocabulary.cpp

Go to the documentation of this file.
00001 
00008 #include "_IDVocabulary.h"
00009 #include <fstream>
00010 #include <iostream>
00011 
00012 using namespace std;
00013 
00015 // Construction/Destruction
00017 
00018 C_IDVocabulary::C_IDVocabulary()
00019 {
00020         this->maxIdInVoc = 0;   
00021 }
00022 
00023 C_IDVocabulary::C_IDVocabulary(const char * fileName)
00024 {
00025                 
00026         this->maxIdInVoc = 0;   
00027 
00028         this->loadFromFile(fileName);
00029 }
00030 
00031 C_IDVocabulary::~C_IDVocabulary()
00032 {
00033 
00034 }
00035 
00038 IndexType C_IDVocabulary::returnId(C_String text)
00039 {
00040         IndexType id;
00041         
00042         map<C_String, IndexType, ltstr>::iterator iterText2Id;
00043         iterText2Id = this->text2id.find(text);
00044 
00045         if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk>
00046                 id = 0;
00047         }
00048         else{
00049                 id = iterText2Id->second;
00050         }
00051         
00052         return id;
00053 }
00054 
00057 C_String C_IDVocabulary::getText(IndexType id)
00058 {
00059         map<IndexType, C_String>::iterator iterId2Text;
00060         iterId2Text = this->id2text.find(id);
00061 
00062         if(iterId2Text==this->id2text.end()){
00063                 return C_String("<UNK>");
00064         }
00065 
00066         return iterId2Text->second;
00067 }
00068 
00069 IndexType C_IDVocabulary::getSize()
00070 {
00071         return this->text2id.size();
00072 }
00073 
00074 
00078 //      in each line.
00079 void C_IDVocabulary::loadFromFile(const char *fileName)
00080 {
00081 
00082         ifstream existingVocFile;
00083         existingVocFile.open(fileName);
00084 
00085         if(!existingVocFile){
00086                 cerr<<"Can not open existing vocabulary file "<<fileName<<endl;
00087                 exit(0);
00088         }
00089 
00090         cerr<<"Loading existing vocabulary file: "<<fileName<<endl;
00091 
00092         char aLine[1024];
00093         char * aToken;
00094         char delimit[] = " \t\r\n";     
00095         IndexType vocId = 0;
00096         
00097         while(!existingVocFile.eof()){
00098                 existingVocFile.getline(aLine, 1024, '\n');
00099                 
00100                 if(strlen(aLine)>0){    //a meaningful word, esp for the last line during reading file
00101                                 vector<C_String> tokensInLine;
00102 
00103                                 aToken = strtok(aLine, delimit);                                
00104                                 while( aToken != NULL ) {       
00105                                         tokensInLine.push_back(C_String(aToken));
00106                                         aToken = strtok( NULL, delimit);
00107                                 }
00108                                 
00109                                 if(tokensInLine.size()!=2){
00110                                         cerr<<"Not valid format for Vocabulary: "<<aLine<<endl;
00111                                 }
00112                                 
00113                                 vocId = atoi(tokensInLine[1].toString());
00114 
00115                                 if(vocId>this->maxIdInVoc){
00116                                         this->maxIdInVoc = vocId;
00117                                 }
00118 
00119                                 this->text2id.insert(make_pair(tokensInLine[0], vocId));
00120                                 this->id2text.insert(make_pair(vocId, tokensInLine[0] ));
00121                 
00122                 }
00123                 
00124                 aLine[0]=0;
00125         }
00126         cerr<<"Total "<<this->text2id.size()<<" word types loaded\n";
00127         cerr<<"Max VocID="<<this->maxIdInVoc<<endl;
00128 }
00129 
00134 IndexType C_IDVocabulary::returnMaxID()
00135 {
00136         return this->maxIdInVoc;
00137 }
00138 
00139 IndexType C_IDVocabulary::returnNullWordID()
00140 {
00141         return 0;
00142 }
00143 
00147 void C_IDVocabulary::outputToFile(char *filename)
00148 {
00149 
00150         ofstream outputVocFile;
00151         outputVocFile.open(filename);
00152 
00153         if(!outputVocFile){
00154                 cerr<<"Can not open "<<filename<<" to write vocabulary\n";
00155                 exit(-1);
00156         }
00157 
00158         map<C_String, IndexType, ltstr>::iterator iterText2Id;
00159 
00160         iterText2Id = this->text2id.begin();
00161         while(iterText2Id!=this->text2id.end()){
00162                 outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl;
00163                 iterText2Id++;
00164         }
00165 
00166         outputVocFile.close();
00167 }
00168 
00177 void C_IDVocabulary::addingReservedWords()
00178 {
00179         this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1);
00180         this->insertWord(C_String("_END_OF_SENTENCE_"), 2);
00181         this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3);
00182         this->insertWord(C_String("_SENTENCE_START_"), 4);
00183         this->insertWord(C_String("_END_OF_CORPUS_"), 5);
00184         
00185         char reservedWord[20];
00186         for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){
00187                 memset(reservedWord, 0, 20);
00188                 sprintf(reservedWord, "_RESERVED_WORDS_%d", i);
00189                 this->insertWord(C_String(reservedWord), i);
00190         }
00191 }
00192 
00193 void C_IDVocabulary::insertWord(C_String text, IndexType id)
00194 {
00195         this->text2id.insert(make_pair(text, id));
00196         this->id2text.insert(make_pair(id, text));
00197 
00198 }
00199 
00205 IndexType C_IDVocabulary::getId(C_String text)
00206 {
00207         IndexType id = this->returnId(text);
00208         if(id==0){
00209                 this->maxIdInVoc++;
00210                 this->insertWord(text, this->maxIdInVoc);
00211                 return this->maxIdInVoc;
00212         }
00213 
00214         //else, already exist
00215         return id;
00216 }

Generated on Fri Jul 6 23:11:06 2007 for SALM by  doxygen 1.5.1