_SuffixArrayApplicationBase.cpp

Go to the documentation of this file.
00001 
00006 #include "_SuffixArrayApplicationBase.h"
00007 
00008 #include "malloc.h"
00009 #include "time.h"
00010 
00011 #include <iostream>
00012 #include <fstream>
00013 
00015 // Construction/Destruction
00017 
00018 C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase()
00019 {
00020         this->level1Buckets = NULL;
00021         this->noVocabulary = false;     //by default, still load the vocabulary
00022         this->noOffset = false; //by default, load offset
00023         this->noLevel1Bucket = false;   //by default, construct level1 bucket
00024 }
00025 
00026 C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase()
00027 {
00028         if(this->level1Buckets!=NULL){
00029                 free(this->level1Buckets);
00030         }
00031 
00032         //not necessary too
00033         free(this->corpus_list);
00034         free(this->suffix_list);
00035 
00036         if(! this->noOffset){
00037                 free(this->offset_list);
00038         }
00039 
00040         if(! this->noVocabulary){
00041                 delete(this->voc);
00042         }
00043 }
00044 
00060 void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket)
00061 {
00062         long ltime1, ltime2;
00063 
00064         this->noVocabulary = noVoc;
00065         this->noOffset = noOffset;
00066         this->noLevel1Bucket = noLevel1Bucket;
00067         
00068         
00069         char tmpString[1000];
00070 
00071         //the order of loading the data is important, do not change
00072         if(! this->noVocabulary){
00073                 time( &ltime1 );
00074                 cerr<<"Loading Vocabulary...\n";
00075                 sprintf(tmpString,"%s.id_voc",fileNameStem);
00076                 this->loadVoc(tmpString);
00077                 time( &ltime2);
00078                 cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n";
00079         }
00080         
00081         time( &ltime1 );
00082         cerr<<"Loading corpus...\n";
00083         sprintf(tmpString,"%s.sa_corpus",fileNameStem); 
00084         this->loadCorpusAndInitMem(tmpString);
00085         time( &ltime2);
00086         cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n";
00087         
00088         time( &ltime1 );
00089         cerr<<"Loading suffix...\n";
00090         sprintf(tmpString,"%s.sa_suffix",fileNameStem);
00091         this->loadSuffix(tmpString);
00092         time( &ltime2);
00093         cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n";
00094 
00095         if(! this->noOffset){
00096                 time( &ltime1 );
00097                 cerr<<"Loading offset...\n";
00098                 sprintf(tmpString,"%s.sa_offset",fileNameStem);
00099                 this->loadOffset(tmpString);
00100                 time( &ltime2);
00101                 cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n";
00102         }
00103 }
00104 
00105 void C_SuffixArrayApplicationBase::loadVoc(const char *filename)
00106 {
00107         this->voc =  new C_IDVocabulary(filename);
00108 }
00109 
00110 void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename)
00111 {
00112         unsigned int dwRead = 0;
00113         FILE *  CorpusInputFile = fopen(filename, "rb");
00114 
00115         if(!CorpusInputFile){
00116                 cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n";
00117                 exit(0);
00118         }
00119         
00120         //first, read the size of the corpus
00121         dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile);
00122         
00123         //allocate memory for all data structure
00124         this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize);
00125         if(! this->corpus_list){
00126                 cerr<<"Can not allocate memory to load the corpus!\n";
00127                 exit(0);
00128         }
00129 
00130         this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize);
00131         if(! this->suffix_list){
00132                 cerr<<"Can not allocate memory to load the suffix!\n";
00133                 exit(0);
00134         }
00135 
00136         if(! this->noOffset){
00137                 this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize);
00138                 if(! this->offset_list){
00139                         cerr<<"Can not allocate memory to load the offset!\n";
00140                         exit(0);
00141                 }
00142         }
00143 
00144         //read the corpus file
00145         unsigned int totalRead = 0;
00146         unsigned int remaining = this->corpusSize;
00147         unsigned int oneBatchReadSize;
00148         char * currentPosInCorpusList = (char *) this->corpus_list;
00149         while(! feof(CorpusInputFile) && (totalRead<this->corpusSize)){
00150                 oneBatchReadSize = SIZE_ONE_READ;
00151                 if(remaining<SIZE_ONE_READ){
00152                         oneBatchReadSize = remaining;
00153                 }
00154 
00155                 dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile);
00156                                 
00157                 totalRead+=dwRead;
00158                 remaining-=dwRead;
00159 
00160                 currentPosInCorpusList+=sizeof(IndexType)*dwRead;
00161         }
00162         if(totalRead!=this->corpusSize){
00163                 cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl;
00164                 exit(0);
00165         }
00166         fclose(CorpusInputFile);
00167 
00168         this->sentIdStart = this->corpus_list[0];
00169         this->vocIdForSentStart = this->corpus_list[1];
00170         this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1];
00171         this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2];
00172 
00173         if(! this->noLevel1Bucket){
00174                 //in this corpus, we will have at most sentIdStart-1 word types
00175                 //the index in the array correspond to the vocId, 0 is for <unk> and the last one is for <sentIdStart-1> which is the largest vocId observed in the data
00176                 this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart);       
00177                 
00178                 //initialize the level1 buckets
00179                 for(IndexType i=0;i<this->sentIdStart;i++){
00180                         this->level1Buckets[i].first = (TextLenType) -1;
00181                         this->level1Buckets[i].last = 0;
00182                 }
00183         }
00184 }
00185 
00186 void C_SuffixArrayApplicationBase::loadSuffix(const char *filename)
00187 {
00188         unsigned int dwRead = 0;
00189         FILE *  SuffixInputFile = fopen(filename, "rb");
00190         if(!SuffixInputFile){
00191                 cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl;
00192                 exit(0);
00193         }
00194 
00195         //first, read in the size of the suffix array
00196         TextLenType suffixArraySize;
00197         dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile);
00198         
00199         if(suffixArraySize!=this->corpusSize){
00200                 cerr<<"Something wrong, the suffix array size is different from the corpus size.\n";
00201                 cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl;
00202                 exit(0);
00203         }
00204 
00205         //read all the suffix into memory
00206         unsigned int totalRead = 0;
00207         unsigned int remaining = suffixArraySize;
00208         unsigned int oneBatchReadSize;
00209         char * currentPosInSuffixList = (char *) this->suffix_list;
00210         while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){
00211                 oneBatchReadSize = SIZE_ONE_READ;
00212                 if(remaining<SIZE_ONE_READ){
00213                         oneBatchReadSize = remaining;
00214                 }
00215 
00216                 dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile);
00217                 
00218                 totalRead+=dwRead;
00219                 remaining -= dwRead;
00220 
00221                 currentPosInSuffixList+=sizeof(TextLenType)*dwRead;
00222         }       
00223         if(totalRead!=suffixArraySize){
00224                 cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl;
00225                 exit(0);
00226         }
00227 
00228         fclose(SuffixInputFile);
00229 
00230         if(! this->noLevel1Bucket){
00231                 //build level-1 bucket
00232                 cerr<<"Initialize level-1 buckets...\n";
00233                 IndexType currentVocId = 0;
00234                 IndexType vocId;
00235                 TextLenType pos;
00236                 TextLenType lastSaIndex = 0;
00237                 
00238                 for(TextLenType i=0; i<suffixArraySize; i++){
00239                         pos = this->suffix_list[i];
00240                         
00241                         //for level1 bucket
00242                         vocId = this->corpus_list[pos];
00243 
00244                         if(vocId<this->sentIdStart){    //is a meaningful word type
00245                                 if(vocId!=currentVocId){
00246                                         this->level1Buckets[currentVocId].last = lastSaIndex;   //for first word which is <unk> this does not matter
00247                                         this->level1Buckets[vocId].first = i;
00248                                         
00249                                         currentVocId=vocId;                             
00250                                 }
00251 
00252                                 lastSaIndex = i;
00253                         }       
00254                 }
00255 
00256                 //for the last word type
00257                 this->level1Buckets[currentVocId].last = lastSaIndex;
00258         }
00259         else{
00260                 this->level1Buckets = NULL;
00261         }
00262 }
00263 
00264 void C_SuffixArrayApplicationBase::loadOffset(const char *filename)
00265 {
00266         unsigned int dwRead = 0;
00267         FILE *  OffsetInputFile = fopen(filename, "rb");
00268         
00269         if(!OffsetInputFile){
00270                 cerr<<"Offset file: "<<filename<<" does not exist!"<<endl;
00271                 exit(0);
00272         }
00273                 
00274         //first, read the size of the corpus    
00275         TextLenType offsetListLen;
00276         dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile);       
00277         if(offsetListLen!=this->corpusSize){
00278                 cerr<<"Text length is inconsistent with the length of the offset.\n";
00279                 exit(0);
00280         }
00281 
00282         //read all the suffix into memory
00283         unsigned int totalRead = 0;
00284         unsigned int remaining = offsetListLen;
00285         unsigned int oneBatchReadSize;
00286         char * currentOffsetListPos = (char *) this->offset_list;
00287         while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){
00288                 oneBatchReadSize = SIZE_ONE_READ;
00289 
00290                 if(remaining<SIZE_ONE_READ){
00291                         oneBatchReadSize = remaining;
00292                 }
00293 
00294                 dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile);
00295                 
00296                 totalRead+=dwRead;
00297                 remaining-=dwRead;
00298 
00299                 currentOffsetListPos+=sizeof(unsigned char)*dwRead;
00300 
00301         }
00302         if(totalRead!=offsetListLen){
00303                 cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl;
00304                 exit(0);
00305         }
00306         fclose(OffsetInputFile);
00307         
00308 }
00309 
00310 TextLenType C_SuffixArrayApplicationBase::returnCorpusSize()
00311 {
00312     return this->corpusSize;
00313 }

Generated on Fri Jul 6 23:11:07 2007 for SALM by  doxygen 1.5.1