00001
00006 #include "_SuffixArrayScanningBase.h"
00007 #include <iostream>
00008
00009 using namespace std;
00010
00011 C_SuffixArrayScanningBase::C_SuffixArrayScanningBase()
00012 {
00013 this->countOfCountsTable = 0;
00014 this->maxFreqConsidered = 1000;
00015 }
00016
00017 C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN)
00018 {
00019 this->countOfCountsTable = 0;
00020 this->maxFreqConsidered = 1000;
00021
00022
00023 this->loadData(filename, false, true, true);
00024
00025 this->initializeForScanning(filename, maxN);
00026 }
00027
00028 void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered)
00029 {
00030 this->maxFreqConsidered = maxFreqConsidered;
00031 }
00032
00033
00037 void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN)
00038 {
00039 this->maxN = maxN;
00040 this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
00041 this->countOfCountsTable = 0;
00042
00043
00044 for(int i=0;i<this->maxN;i++){
00045 this->nGramScanningList[i].freqSoFar=0;
00046 this->nGramScanningList[i].vocId = 0;
00047 this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1;
00048 }
00049
00050
00051 this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
00052
00053 if(this->vocIdForSentEnd==0){
00054 cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
00055 exit(0);
00056 }
00057
00058 this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
00059 if(this->vocIdForSentStart==0){
00060 cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
00061 exit(0);
00062 }
00063
00064 this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
00065 if(this->vocIdForCorpusEnd==0){
00066 cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
00067 exit(0);
00068 }
00069 }
00070
00071 C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase()
00072 {
00073 free(this->nGramScanningList);
00074
00075 if(this->countOfCountsTable!=0){
00076 free(this->countOfCountsTable);
00077 }
00078
00079 }
00080
00081 void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh)
00082 {
00083 if(n>this->maxN){
00084 cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl;
00085 exit(0);
00086 }
00087
00088 this->nGramScanningList[n-1].freqThreshForOutput = freqThresh;
00089 }
00090
00091 void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType()
00092 {
00093 this->scanSuffixArray('H');
00094
00095 }
00096
00100 void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered)
00101 {
00102 this->maxFreqConsidered = maxFreqConsidered;
00103 this->constructCountOfCountsTable();
00104
00105
00106 cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl;
00107 for(int i=0;i<this->maxN;i++){
00108 cout<<i+1<<endl;
00109
00110 unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered;
00111 for(int freq=0;freq<maxFreqConsidered;freq++){
00112 cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl;
00113 }
00114 }
00115
00116 }
00117
00120 void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken()
00121 {
00122 this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
00123 this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
00124
00125
00126 for(int n=0;n<maxN;n++){
00127 this->typeFreq[n]=0;
00128 this->tokenFreq[n]=0;
00129 }
00130
00131
00132
00133 this->scanSuffixArray('T');
00134
00135
00136 cout<<"n\tType\tToken\n";
00137 for(int i=0;i<this->maxN;i++){
00138 cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl;
00139 }
00140 }
00141
00146 void C_SuffixArrayScanningBase::constructCountOfCountsTable()
00147 {
00148 if(this->countOfCountsTable!=0){
00149 free(this->countOfCountsTable);
00150 }
00151
00152 this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered);
00153
00154 if(this->countOfCountsTable==NULL){
00155 cerr<<"Count of counts table can not be initialized. Exit\n";
00156 exit(0);
00157 }
00158
00159 for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){
00160 this->countOfCountsTable[c]=0;
00161 }
00162
00163 this->scanSuffixArray('C');
00164
00165
00166 }
00167
00172 void C_SuffixArrayScanningBase::scanSuffixArray(char actionType)
00173 {
00174
00175 int i,j;
00176 bool stillMeaningful = true;
00177 TextLenType saPos=0;
00178
00179 while(stillMeaningful && ( saPos<this->corpusSize ) ){
00180
00181 TextLenType posInCorpus = this->suffix_list[saPos];
00182 IndexType wordInCorpus = this->corpus_list[posInCorpus];
00183
00184 if(wordInCorpus<this->sentIdStart){
00185
00186 if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){
00187
00188 bool quit =false;
00189 i=0;
00190
00191 while(!quit && (i<this->maxN)){
00192 wordInCorpus = this->corpus_list[posInCorpus+i];
00193 if(
00194 (wordInCorpus<this->sentIdStart)&&
00195 (wordInCorpus!=this->vocIdForSentEnd)&&
00196 (wordInCorpus!=this->vocIdForSentStart)&&
00197 (wordInCorpus==this->nGramScanningList[i].vocId)){
00198
00199 this->nGramScanningList[i].freqSoFar++;
00200 }
00201 else{
00202
00203 bool validNgramUpSoFar = true;
00204 unsigned int freqSoFar;
00205 C_String tmpPhrase;
00206
00207
00208 if(actionType=='H'){
00209
00210 for(j=0;j<=i-1;j++){
00211 if(this->nGramScanningList[j].vocId==0){
00212 validNgramUpSoFar = false;
00213 }
00214 tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
00215 tmpPhrase.appending(C_String(" "));
00216 }
00217 }
00218
00219
00220 for(j=i;j<this->maxN;j++){
00221
00222
00223 if(this->nGramScanningList[j].vocId==0){
00224 validNgramUpSoFar = false;
00225 }
00226
00227 if(validNgramUpSoFar){
00228
00229 switch(actionType){
00230
00231 case 'C':
00232 freqSoFar = this->nGramScanningList[j].freqSoFar;
00233 if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
00234
00235 this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++;
00236 }
00237 break;
00238
00239 case 'H':
00240 tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
00241 tmpPhrase.appending(C_String(" "));
00242
00243 if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){
00244 cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl;
00245 }
00246 break;
00247
00248 case 'T':
00249 if(this->nGramScanningList[j].freqSoFar>0){
00250 typeFreq[j]++;
00251 }
00252
00253 tokenFreq[j]+=this->nGramScanningList[j].freqSoFar;
00254
00255 break;
00256 default:
00257 cerr<<"Unknown action!\n";
00258 exit(-1);
00259 }
00260 }
00261
00262
00263 if((posInCorpus+j)<this->corpusSize){
00264 wordInCorpus = this->corpus_list[posInCorpus+j];
00265 }
00266 else{
00267 wordInCorpus = 0;
00268 }
00269
00270 if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
00271 wordInCorpus=0;
00272 this->nGramScanningList[j].freqSoFar = 0;
00273 }
00274 else{
00275 this->nGramScanningList[j].freqSoFar = 1;
00276 }
00277
00278 this->nGramScanningList[j].vocId = wordInCorpus;
00279 }
00280
00281 quit=true;
00282 }
00283
00284 i++;
00285 }
00286 }
00287 }
00288 else{
00289 stillMeaningful = false;
00290 }
00291
00292 saPos++;
00293 }
00294
00295
00296 C_String finalTmpString;
00297 bool validNgramUpSoFar = true;
00298 unsigned int freqSoFar;
00299 for(i=0;i<this->maxN;i++){
00300 if(this->nGramScanningList[i].vocId==0){
00301 validNgramUpSoFar = false;
00302 }
00303
00304 if(validNgramUpSoFar){
00305 switch(actionType){
00306 case 'C':
00307 freqSoFar = this->nGramScanningList[i].freqSoFar;
00308 if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
00309
00310 this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++;
00311 }
00312 break;
00313
00314 case 'H':
00315 finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId));
00316 finalTmpString.appending(C_String(" "));
00317 if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){
00318 cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl;
00319 }
00320 break;
00321
00322 case 'T':
00323 if(this->nGramScanningList[i].freqSoFar>0){
00324 typeFreq[i]++;
00325 }
00326
00327 tokenFreq[i]+=this->nGramScanningList[i].freqSoFar;
00328 break;
00329
00330 default:
00331 cerr<<"Unknown action!\n";
00332 exit(-1);
00333 }
00334 }
00335 }
00336
00337 }