00001
#include "_IDVocabulary.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
00011
00012 using namespace std;
00013
00015
00017
00018 C_IDVocabulary::C_IDVocabulary()
00019 {
00020 this->maxIdInVoc = 0;
00021 }
00022
00023 C_IDVocabulary::C_IDVocabulary(const char * fileName)
00024 {
00025
00026 this->maxIdInVoc = 0;
00027
00028 this->loadFromFile(fileName);
00029 }
00030
00031 C_IDVocabulary::~C_IDVocabulary()
00032 {
00033
00034 }
00035
00038 IndexType C_IDVocabulary::returnId(C_String text)
00039 {
00040 IndexType id;
00041
00042 map<C_String, IndexType, ltstr>::iterator iterText2Id;
00043 iterText2Id = this->text2id.find(text);
00044
00045 if(iterText2Id==this->text2id.end()){
00046 id = 0;
00047 }
00048 else{
00049 id = iterText2Id->second;
00050 }
00051
00052 return id;
00053 }
00054
00057 C_String C_IDVocabulary::getText(IndexType id)
00058 {
00059 map<IndexType, C_String>::iterator iterId2Text;
00060 iterId2Text = this->id2text.find(id);
00061
00062 if(iterId2Text==this->id2text.end()){
00063 return C_String("<UNK>");
00064 }
00065
00066 return iterId2Text->second;
00067 }
00068
00069 IndexType C_IDVocabulary::getSize()
00070 {
00071 return this->text2id.size();
00072 }
00073
00074
00078
00079 void C_IDVocabulary::loadFromFile(const char *fileName)
00080 {
00081
00082 ifstream existingVocFile;
00083 existingVocFile.open(fileName);
00084
00085 if(!existingVocFile){
00086 cerr<<"Can not open existing vocabulary file "<<fileName<<endl;
00087 exit(0);
00088 }
00089
00090 cerr<<"Loading existing vocabulary file: "<<fileName<<endl;
00091
00092 char aLine[1024];
00093 char * aToken;
00094 char delimit[] = " \t\r\n";
00095 IndexType vocId = 0;
00096
00097 while(!existingVocFile.eof()){
00098 existingVocFile.getline(aLine, 1024, '\n');
00099
00100 if(strlen(aLine)>0){
00101 vector<C_String> tokensInLine;
00102
00103 aToken = strtok(aLine, delimit);
00104 while( aToken != NULL ) {
00105 tokensInLine.push_back(C_String(aToken));
00106 aToken = strtok( NULL, delimit);
00107 }
00108
00109 if(tokensInLine.size()!=2){
00110 cerr<<"Not valid format for Vocabulary: "<<aLine<<endl;
00111 }
00112
00113 vocId = atoi(tokensInLine[1].toString());
00114
00115 if(vocId>this->maxIdInVoc){
00116 this->maxIdInVoc = vocId;
00117 }
00118
00119 this->text2id.insert(make_pair(tokensInLine[0], vocId));
00120 this->id2text.insert(make_pair(vocId, tokensInLine[0] ));
00121
00122 }
00123
00124 aLine[0]=0;
00125 }
00126 cerr<<"Total "<<this->text2id.size()<<" word types loaded\n";
00127 cerr<<"Max VocID="<<this->maxIdInVoc<<endl;
00128 }
00129
00134 IndexType C_IDVocabulary::returnMaxID()
00135 {
00136 return this->maxIdInVoc;
00137 }
00138
00139 IndexType C_IDVocabulary::returnNullWordID()
00140 {
00141 return 0;
00142 }
00143
00147 void C_IDVocabulary::outputToFile(char *filename)
00148 {
00149
00150 ofstream outputVocFile;
00151 outputVocFile.open(filename);
00152
00153 if(!outputVocFile){
00154 cerr<<"Can not open "<<filename<<" to write vocabulary\n";
00155 exit(-1);
00156 }
00157
00158 map<C_String, IndexType, ltstr>::iterator iterText2Id;
00159
00160 iterText2Id = this->text2id.begin();
00161 while(iterText2Id!=this->text2id.end()){
00162 outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl;
00163 iterText2Id++;
00164 }
00165
00166 outputVocFile.close();
00167 }
00168
00177 void C_IDVocabulary::addingReservedWords()
00178 {
00179 this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1);
00180 this->insertWord(C_String("_END_OF_SENTENCE_"), 2);
00181 this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3);
00182 this->insertWord(C_String("_SENTENCE_START_"), 4);
00183 this->insertWord(C_String("_END_OF_CORPUS_"), 5);
00184
00185 char reservedWord[20];
00186 for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){
00187 memset(reservedWord, 0, 20);
00188 sprintf(reservedWord, "_RESERVED_WORDS_%d", i);
00189 this->insertWord(C_String(reservedWord), i);
00190 }
00191 }
00192
00193 void C_IDVocabulary::insertWord(C_String text, IndexType id)
00194 {
00195 this->text2id.insert(make_pair(text, id));
00196 this->id2text.insert(make_pair(id, text));
00197
00198 }
00199
00205 IndexType C_IDVocabulary::getId(C_String text)
00206 {
00207 IndexType id = this->returnId(text);
00208 if(id==0){
00209 this->maxIdInVoc++;
00210 this->insertWord(text, this->maxIdInVoc);
00211 return this->maxIdInVoc;
00212 }
00213
00214
00215 return id;
00216 }