_SuffixArrayScanningBase.cpp

Go to the documentation of this file.
00001 
00006 #include "_SuffixArrayScanningBase.h"
00007 #include <iostream>
00008 
00009 using namespace std;
00010 
00011 C_SuffixArrayScanningBase::C_SuffixArrayScanningBase()
00012 {
00013         this->countOfCountsTable = 0;   //no memory has been allocated
00014         this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
00015 }
00016 
00017 C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN)
00018 {
00019         this->countOfCountsTable = 0;  //no memory has been allocated
00020         this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
00021 
00022         //load suffix array
00023         this->loadData(filename, false, true, true);
00024 
00025         this->initializeForScanning(filename, maxN);
00026 }
00027 
00028 void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered)
00029 {
00030         this->maxFreqConsidered = maxFreqConsidered;
00031 }
00032 
00033 
00037 void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN)
00038 {
00039         this->maxN = maxN;
00040         this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
00041         this->countOfCountsTable = 0;   //no memory has been allocated
00042 
00043         //initialize the scanning list
00044         for(int i=0;i<this->maxN;i++){
00045                 this->nGramScanningList[i].freqSoFar=0;
00046                 this->nGramScanningList[i].vocId = 0;
00047                 this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1;     //default, do not output
00048         }
00049 
00050         //get vocID for sentEnd
00051         this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
00052 
00053         if(this->vocIdForSentEnd==0){
00054                 cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
00055                 exit(0);
00056         }
00057 
00058         this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
00059         if(this->vocIdForSentStart==0){
00060                 cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
00061                 exit(0);
00062         }
00063         
00064         this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
00065         if(this->vocIdForCorpusEnd==0){
00066                 cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
00067                 exit(0);
00068         }
00069 }
00070 
00071 C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase()
00072 {
00073         free(this->nGramScanningList);
00074 
00075         if(this->countOfCountsTable!=0){
00076                 free(this->countOfCountsTable);
00077         }
00078 
00079 }
00080 
00081 void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh)
00082 {
00083         if(n>this->maxN){
00084                 cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl;
00085                 exit(0);
00086         }
00087 
00088         this->nGramScanningList[n-1].freqThreshForOutput = freqThresh;
00089 }
00090 
00091 void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType()
00092 {
00093         this->scanSuffixArray('H');
00094 
00095 }
00096 
00100 void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered)
00101 {
00102         this->maxFreqConsidered = maxFreqConsidered;
00103         this->constructCountOfCountsTable();
00104         
00105         //output the count of counts
00106         cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl;
00107         for(int i=0;i<this->maxN;i++){
00108                 cout<<i+1<<endl;
00109                 
00110                 unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered;
00111                 for(int freq=0;freq<maxFreqConsidered;freq++){
00112                         cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl;
00113                 }
00114         }
00115         
00116 }
00117 
00120 void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken()
00121 {
00122         this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
00123         this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
00124 
00125         //initialize
00126         for(int n=0;n<maxN;n++){
00127                 this->typeFreq[n]=0;
00128                 this->tokenFreq[n]=0;
00129         }
00130 
00131 
00132         //scan the suffix array
00133         this->scanSuffixArray('T');
00134 
00135         //output
00136         cout<<"n\tType\tToken\n";
00137         for(int i=0;i<this->maxN;i++){
00138                 cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl;
00139         }
00140 }
00141 
00146 void C_SuffixArrayScanningBase::constructCountOfCountsTable()
00147 {
00148         if(this->countOfCountsTable!=0){        //if there is already a count of counts table
00149                 free(this->countOfCountsTable);
00150         }
00151 
00152         this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered);
00153 
00154         if(this->countOfCountsTable==NULL){
00155                 cerr<<"Count of counts table can not be initialized. Exit\n";
00156                 exit(0);
00157         }
00158 
00159         for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){
00160                 this->countOfCountsTable[c]=0;
00161         }
00162 
00163         this->scanSuffixArray('C');
00164 
00165 
00166 }
00167 
00172 void C_SuffixArrayScanningBase::scanSuffixArray(char actionType)
00173 {
00174         
00175         int i,j;
00176         bool stillMeaningful = true;    
00177         TextLenType saPos=0;
00178 
00179         while(stillMeaningful && ( saPos<this->corpusSize ) ){
00180 
00181                 TextLenType posInCorpus = this->suffix_list[saPos];
00182                 IndexType wordInCorpus = this->corpus_list[posInCorpus];
00183 
00184                 if(wordInCorpus<this->sentIdStart){     //SA positions pointing to sentID are not interesting
00185                         
00186                         if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){    //n-grams start with <s> and </s>, or <end of corpus> are not interested
00187                         
00188                                 bool quit =false;
00189                                 i=0;
00190 
00191                                 while(!quit && (i<this->maxN)){
00192                                         wordInCorpus = this->corpus_list[posInCorpus+i];
00193                                         if(                                             
00194                                                 (wordInCorpus<this->sentIdStart)&&
00195                                                 (wordInCorpus!=this->vocIdForSentEnd)&&
00196                                                 (wordInCorpus!=this->vocIdForSentStart)&&
00197                                                 (wordInCorpus==this->nGramScanningList[i].vocId)){      //still match
00198 
00199                                                 this->nGramScanningList[i].freqSoFar++;
00200                                         }
00201                                         else{   //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type
00202                                                                         
00203                                                 bool validNgramUpSoFar = true;
00204                                                 unsigned int freqSoFar;
00205                                                 C_String tmpPhrase; //for output high freq n-grams
00206 
00207                                                 //prepare the prefix of the n-grams
00208                                                 if(actionType=='H'){
00209                                                         //common i-gram
00210                                                         for(j=0;j<=i-1;j++){
00211                                                                 if(this->nGramScanningList[j].vocId==0){        //one of the word in the common i-gram is a NULL word, not a valid n-gram
00212                                                                         validNgramUpSoFar = false;
00213                                                                 }
00214                                                                 tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
00215                                                                 tmpPhrase.appending(C_String(" "));
00216                                                         }       
00217                                                 }
00218 
00219 
00220                                                 for(j=i;j<this->maxN;j++){                              
00221                                                         
00222                                                         
00223                                                         if(this->nGramScanningList[j].vocId==0){                //a NULL word, then this n-gram and longer ones in the scan window are invalid
00224                                                                 validNgramUpSoFar = false;
00225                                                         }
00226 
00227                                                         if(validNgramUpSoFar){          //perform actions depends on actionType
00228                                                                 
00229                                                                 switch(actionType){
00230 
00231                                                                 case 'C':       //count of counts
00232                                                                         freqSoFar = this->nGramScanningList[j].freqSoFar;
00233                                                                         if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
00234                                                                                 //increase the count for (j+1)-gram with freq freqSoFar
00235                                                                                 this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++;
00236                                                                         }
00237                                                                         break;
00238 
00239                                                                 case 'H':       //output high-freq n-grams
00240                                                                         tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
00241                                                                         tmpPhrase.appending(C_String(" "));
00242 
00243                                                                         if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){                                                       
00244                                                                                 cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl;
00245                                                                         }
00246                                                                         break;
00247 
00248                                                                 case 'T':       //type-token statistics
00249                                                                         if(this->nGramScanningList[j].freqSoFar>0){
00250                                                                                 typeFreq[j]++;
00251                                                                         }
00252 
00253                                                                         tokenFreq[j]+=this->nGramScanningList[j].freqSoFar;
00254 
00255                                                                         break;  
00256                                                                 default: 
00257                                                                         cerr<<"Unknown action!\n";
00258                                                                         exit(-1);
00259                                                                 }
00260                                                         }
00261 
00262                                                         //finished output, now clear the list from point of i
00263                                                         if((posInCorpus+j)<this->corpusSize){
00264                                                                 wordInCorpus = this->corpus_list[posInCorpus+j];
00265                                                         }
00266                                                         else{
00267                                                                 wordInCorpus = 0;       //out of bound for corpus
00268                                                         }
00269 
00270                                                         if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
00271                                                                 wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
00272                                                                 this->nGramScanningList[j].freqSoFar = 0;
00273                                                         }
00274                                                         else{
00275                                                                 this->nGramScanningList[j].freqSoFar = 1;
00276                                                         }
00277 
00278                                                         this->nGramScanningList[j].vocId = wordInCorpus;                                                        
00279                                                 }
00280 
00281                                                 quit=true;      //at i+1 gram, already not match, no need to check for longer
00282                                         }
00283 
00284                                         i++;
00285                                 }
00286                         }
00287                 }
00288                 else{
00289                         stillMeaningful = false;        //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text
00290                 }
00291 
00292                 saPos++;
00293         }
00294 
00295         //at the end of corpus (according to suffix order)
00296         C_String finalTmpString;        //for output high-freq n-gram type
00297         bool validNgramUpSoFar = true;
00298         unsigned int freqSoFar;
00299         for(i=0;i<this->maxN;i++){
00300                 if(this->nGramScanningList[i].vocId==0){        //invalide word
00301                         validNgramUpSoFar = false;
00302                 }
00303 
00304                 if(validNgramUpSoFar){
00305                         switch(actionType){
00306                         case 'C':       //for count-of-counts
00307                                 freqSoFar = this->nGramScanningList[i].freqSoFar;
00308                                 if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
00309                                         //increase the count for (i+1)-gram with freq freqSoFar
00310                                         this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++;
00311                                 }
00312                                 break;
00313 
00314                         case 'H':       //for high-freq n-gram types
00315                                 finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId));
00316                                 finalTmpString.appending(C_String(" "));
00317                                 if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){                        
00318                                         cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl;
00319                                 }
00320                                 break;
00321 
00322                         case 'T':       //for type-token statistics
00323                                 if(this->nGramScanningList[i].freqSoFar>0){
00324                                         typeFreq[i]++;
00325                                 }
00326 
00327                                 tokenFreq[i]+=this->nGramScanningList[i].freqSoFar;
00328                                 break;
00329 
00330                         default: 
00331                                 cerr<<"Unknown action!\n";
00332                                 exit(-1);
00333                         }
00334                 }
00335         }
00336 
00337 }

Generated on Fri Jul 6 23:11:08 2007 for SALM by  doxygen 1.5.1