00001
00006 #include "_SuffixArrayApplicationBase.h"
00007
00008 #include "malloc.h"
00009 #include "time.h"
00010
00011 #include <iostream>
00012 #include <fstream>
00013
00015
00017
00018 C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase()
00019 {
00020 this->level1Buckets = NULL;
00021 this->noVocabulary = false;
00022 this->noOffset = false;
00023 this->noLevel1Bucket = false;
00024 }
00025
00026 C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase()
00027 {
00028 if(this->level1Buckets!=NULL){
00029 free(this->level1Buckets);
00030 }
00031
00032
00033 free(this->corpus_list);
00034 free(this->suffix_list);
00035
00036 if(! this->noOffset){
00037 free(this->offset_list);
00038 }
00039
00040 if(! this->noVocabulary){
00041 delete(this->voc);
00042 }
00043 }
00044
00060 void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket)
00061 {
00062 long ltime1, ltime2;
00063
00064 this->noVocabulary = noVoc;
00065 this->noOffset = noOffset;
00066 this->noLevel1Bucket = noLevel1Bucket;
00067
00068
00069 char tmpString[1000];
00070
00071
00072 if(! this->noVocabulary){
00073 time( <ime1 );
00074 cerr<<"Loading Vocabulary...\n";
00075 sprintf(tmpString,"%s.id_voc",fileNameStem);
00076 this->loadVoc(tmpString);
00077 time( <ime2);
00078 cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n";
00079 }
00080
00081 time( <ime1 );
00082 cerr<<"Loading corpus...\n";
00083 sprintf(tmpString,"%s.sa_corpus",fileNameStem);
00084 this->loadCorpusAndInitMem(tmpString);
00085 time( <ime2);
00086 cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n";
00087
00088 time( <ime1 );
00089 cerr<<"Loading suffix...\n";
00090 sprintf(tmpString,"%s.sa_suffix",fileNameStem);
00091 this->loadSuffix(tmpString);
00092 time( <ime2);
00093 cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n";
00094
00095 if(! this->noOffset){
00096 time( <ime1 );
00097 cerr<<"Loading offset...\n";
00098 sprintf(tmpString,"%s.sa_offset",fileNameStem);
00099 this->loadOffset(tmpString);
00100 time( <ime2);
00101 cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n";
00102 }
00103 }
00104
00105 void C_SuffixArrayApplicationBase::loadVoc(const char *filename)
00106 {
00107 this->voc = new C_IDVocabulary(filename);
00108 }
00109
00110 void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename)
00111 {
00112 unsigned int dwRead = 0;
00113 FILE * CorpusInputFile = fopen(filename, "rb");
00114
00115 if(!CorpusInputFile){
00116 cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n";
00117 exit(0);
00118 }
00119
00120
00121 dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile);
00122
00123
00124 this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize);
00125 if(! this->corpus_list){
00126 cerr<<"Can not allocate memory to load the corpus!\n";
00127 exit(0);
00128 }
00129
00130 this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize);
00131 if(! this->suffix_list){
00132 cerr<<"Can not allocate memory to load the suffix!\n";
00133 exit(0);
00134 }
00135
00136 if(! this->noOffset){
00137 this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize);
00138 if(! this->offset_list){
00139 cerr<<"Can not allocate memory to load the offset!\n";
00140 exit(0);
00141 }
00142 }
00143
00144
00145 unsigned int totalRead = 0;
00146 unsigned int remaining = this->corpusSize;
00147 unsigned int oneBatchReadSize;
00148 char * currentPosInCorpusList = (char *) this->corpus_list;
00149 while(! feof(CorpusInputFile) && (totalRead<this->corpusSize)){
00150 oneBatchReadSize = SIZE_ONE_READ;
00151 if(remaining<SIZE_ONE_READ){
00152 oneBatchReadSize = remaining;
00153 }
00154
00155 dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile);
00156
00157 totalRead+=dwRead;
00158 remaining-=dwRead;
00159
00160 currentPosInCorpusList+=sizeof(IndexType)*dwRead;
00161 }
00162 if(totalRead!=this->corpusSize){
00163 cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl;
00164 exit(0);
00165 }
00166 fclose(CorpusInputFile);
00167
00168 this->sentIdStart = this->corpus_list[0];
00169 this->vocIdForSentStart = this->corpus_list[1];
00170 this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1];
00171 this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2];
00172
00173 if(! this->noLevel1Bucket){
00174
00175
00176 this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart);
00177
00178
00179 for(IndexType i=0;i<this->sentIdStart;i++){
00180 this->level1Buckets[i].first = (TextLenType) -1;
00181 this->level1Buckets[i].last = 0;
00182 }
00183 }
00184 }
00185
00186 void C_SuffixArrayApplicationBase::loadSuffix(const char *filename)
00187 {
00188 unsigned int dwRead = 0;
00189 FILE * SuffixInputFile = fopen(filename, "rb");
00190 if(!SuffixInputFile){
00191 cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl;
00192 exit(0);
00193 }
00194
00195
00196 TextLenType suffixArraySize;
00197 dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile);
00198
00199 if(suffixArraySize!=this->corpusSize){
00200 cerr<<"Something wrong, the suffix array size is different from the corpus size.\n";
00201 cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl;
00202 exit(0);
00203 }
00204
00205
00206 unsigned int totalRead = 0;
00207 unsigned int remaining = suffixArraySize;
00208 unsigned int oneBatchReadSize;
00209 char * currentPosInSuffixList = (char *) this->suffix_list;
00210 while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){
00211 oneBatchReadSize = SIZE_ONE_READ;
00212 if(remaining<SIZE_ONE_READ){
00213 oneBatchReadSize = remaining;
00214 }
00215
00216 dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile);
00217
00218 totalRead+=dwRead;
00219 remaining -= dwRead;
00220
00221 currentPosInSuffixList+=sizeof(TextLenType)*dwRead;
00222 }
00223 if(totalRead!=suffixArraySize){
00224 cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl;
00225 exit(0);
00226 }
00227
00228 fclose(SuffixInputFile);
00229
00230 if(! this->noLevel1Bucket){
00231
00232 cerr<<"Initialize level-1 buckets...\n";
00233 IndexType currentVocId = 0;
00234 IndexType vocId;
00235 TextLenType pos;
00236 TextLenType lastSaIndex = 0;
00237
00238 for(TextLenType i=0; i<suffixArraySize; i++){
00239 pos = this->suffix_list[i];
00240
00241
00242 vocId = this->corpus_list[pos];
00243
00244 if(vocId<this->sentIdStart){
00245 if(vocId!=currentVocId){
00246 this->level1Buckets[currentVocId].last = lastSaIndex;
00247 this->level1Buckets[vocId].first = i;
00248
00249 currentVocId=vocId;
00250 }
00251
00252 lastSaIndex = i;
00253 }
00254 }
00255
00256
00257 this->level1Buckets[currentVocId].last = lastSaIndex;
00258 }
00259 else{
00260 this->level1Buckets = NULL;
00261 }
00262 }
00263
00264 void C_SuffixArrayApplicationBase::loadOffset(const char *filename)
00265 {
00266 unsigned int dwRead = 0;
00267 FILE * OffsetInputFile = fopen(filename, "rb");
00268
00269 if(!OffsetInputFile){
00270 cerr<<"Offset file: "<<filename<<" does not exist!"<<endl;
00271 exit(0);
00272 }
00273
00274
00275 TextLenType offsetListLen;
00276 dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile);
00277 if(offsetListLen!=this->corpusSize){
00278 cerr<<"Text length is inconsistent with the length of the offset.\n";
00279 exit(0);
00280 }
00281
00282
00283 unsigned int totalRead = 0;
00284 unsigned int remaining = offsetListLen;
00285 unsigned int oneBatchReadSize;
00286 char * currentOffsetListPos = (char *) this->offset_list;
00287 while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){
00288 oneBatchReadSize = SIZE_ONE_READ;
00289
00290 if(remaining<SIZE_ONE_READ){
00291 oneBatchReadSize = remaining;
00292 }
00293
00294 dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile);
00295
00296 totalRead+=dwRead;
00297 remaining-=dwRead;
00298
00299 currentOffsetListPos+=sizeof(unsigned char)*dwRead;
00300
00301 }
00302 if(totalRead!=offsetListLen){
00303 cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl;
00304 exit(0);
00305 }
00306 fclose(OffsetInputFile);
00307
00308 }
00309
00310 TextLenType C_SuffixArrayApplicationBase::returnCorpusSize()
00311 {
00312 return this->corpusSize;
00313 }