00001
00006 #include "_SuffixArrayLanguageModel.h"
00007 #include <iostream>
00008 #include <fstream>
00009 #include "math.h"
00010
00011 using namespace std;
00012
00013
00014 C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
00015 {
00016
00017 }
00018
00019 C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
00020 {
00021
00022 }
00023
00024
00047 C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
00048 {
00049
00050 fstream cfgFile;
00051 cfgFile.open(cfgFileName,ios::in);
00052
00053 if(!cfgFile){
00054 fprintf(stderr,"Configuration file does not exist! quit!!\n");
00055 exit(0);
00056 }
00057
00058
00059
00060 char paraName[1024];
00061 char corpusFileNameStem[1024];
00062 corpusFileNameStem[0]=0;
00063 this->maxFreqForDiscounting=-1;
00064
00065 this->interpolationStrategy = 'e';
00066 this->maxN = 5;
00067
00068 while(!cfgFile.eof()){
00069 cfgFile>>paraName;
00070
00071 if(strcmp(paraName,"CORPUS")==0){
00072 cfgFile>>corpusFileNameStem;
00073 }
00074 else if(strcmp(paraName,"N")==0){
00075 cfgFile>>this->maxN;
00076 }
00077 else if(strcmp(paraName,"MAX_FREQ_DISC")==0){
00078 cfgFile>>maxFreqForDiscounting;
00079 }
00080 else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){
00081 cfgFile>>this->interpolationStrategy;
00082 }
00083
00084 paraName[0]=0;
00085
00086 }
00087
00088
00089 if(strlen(corpusFileNameStem)==0){
00090 cerr<<"CORPUS need to be specified in the configuration file. This should be the corpus name used for LM.\n";
00091 exit(-1);
00092 }
00093 this->loadData_forSearch(corpusFileNameStem, false, true);
00094
00095
00096
00097 if(this->maxFreqForDiscounting<=0){
00098 this->applyDiscounting = false;
00099 }
00100 else{
00101 this->applyDiscounting = true;
00102 this->constructDiscountingMap();
00103 }
00104
00105
00106 this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
00107
00108 if(this->vocIdForSentEnd==0){
00109 cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
00110 exit(0);
00111 }
00112
00113 this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
00114 if(this->vocIdForSentStart==0){
00115 cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
00116 exit(0);
00117 }
00118
00119 this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
00120 if(this->vocIdForCorpusEnd==0){
00121 cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
00122 exit(0);
00123 }
00124
00125 this->interpolationStrategy = 'e';
00126
00127 }
00128
00129
00135 void C_SuffixArrayLanguageModel::constructDiscountingMap()
00136 {
00137 int i,j;
00138 unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);
00139
00140 if(countOfCountsTable==NULL){
00141 cerr<<"Count of counts table can not be initialized. Exit\n";
00142 exit(0);
00143 }
00144
00145
00146 for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){
00147 countOfCountsTable[c]=0;
00148 }
00149
00150
00151 S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
00152 for(i=0;i<this->maxN;i++){
00153 nGramScanningList[i].freqSoFar=0;
00154 nGramScanningList[i].vocId = 0;
00155 nGramScanningList[i].freqThreshForOutput = (unsigned int) -1;
00156 }
00157
00158 bool stillMeaningful = true;
00159 TextLenType saPos=0;
00160
00161 while(stillMeaningful && ( saPos<this->corpusSize ) ){
00162
00163 TextLenType posInCorpus = this->suffix_list[saPos];
00164 IndexType wordInCorpus = this->corpus_list[posInCorpus];
00165
00166 if(wordInCorpus<this->sentIdStart){
00167
00168 if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){
00169
00170 bool quit =false;
00171 i=0;
00172
00173 while(!quit && (i<this->maxN)){
00174 wordInCorpus = this->corpus_list[posInCorpus+i];
00175 if(
00176 (wordInCorpus<this->sentIdStart)&&
00177 (wordInCorpus!=this->vocIdForSentEnd)&&
00178 (wordInCorpus!=this->vocIdForSentStart)&&
00179 (wordInCorpus==nGramScanningList[i].vocId)){
00180
00181 nGramScanningList[i].freqSoFar++;
00182 }
00183 else{
00184
00185 bool validNgramUpSoFar = true;
00186 unsigned int freqSoFar;
00187
00188 for(j=i;j<this->maxN;j++){
00189
00190
00191 if(nGramScanningList[j].vocId==0){
00192 validNgramUpSoFar = false;
00193 }
00194
00195 if(validNgramUpSoFar){
00196
00197 freqSoFar = nGramScanningList[j].freqSoFar;
00198 if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
00199
00200 countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
00201 }
00202 }
00203
00204
00205 if((posInCorpus+j)<this->corpusSize){
00206 wordInCorpus = this->corpus_list[posInCorpus+j];
00207 }
00208 else{
00209 wordInCorpus = 0;
00210 }
00211
00212 if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
00213 wordInCorpus=0;
00214 nGramScanningList[j].freqSoFar = 0;
00215 }
00216 else{
00217 nGramScanningList[j].freqSoFar = 1;
00218 }
00219
00220 nGramScanningList[j].vocId = wordInCorpus;
00221 }
00222
00223 quit=true;
00224 }
00225
00226 i++;
00227 }
00228 }
00229 }
00230 else{
00231 stillMeaningful = false;
00232 }
00233
00234 saPos++;
00235 }
00236
00237
00238 bool validNgramUpSoFar = true;
00239 unsigned int freqSoFar;
00240 for(i=0;i<this->maxN;i++){
00241 if(nGramScanningList[i].vocId==0){
00242 validNgramUpSoFar = false;
00243 }
00244
00245 if(validNgramUpSoFar){
00246
00247 freqSoFar = nGramScanningList[i].freqSoFar;
00248 if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
00249
00250 countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
00251 }
00252 }
00253 }
00254
00255
00256
00257 this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting);
00258
00259 for(i=0;i<this->maxN;i++){
00260
00261
00262 unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
00263 double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;
00264
00265 for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){
00266
00267 if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){
00268 discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
00269 }
00270 else{
00271 discountingMapForThisN[freq] = -1;
00272 }
00273 }
00274
00275 discountingMapForThisN[this->maxFreqForDiscounting-1] = -1;
00276 }
00277
00278
00279 free(countOfCountsTable);
00280
00281 }
00282
00291 void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
00292 {
00293 vector<IndexType> nGram;
00294
00295 if(currentMatchStart!=(TextLenType) -1){
00296 if(currentMatchLen==this->maxN){
00297 currentMatchStart++;
00298 currentMatchLen--;
00299 }
00300
00301 for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
00302 nGram.push_back(this->corpus_list[pos]);
00303 }
00304 }
00305
00306 nGram.push_back(nextWord);
00307
00308 int sentLen = nGram.size();
00309
00310
00311 S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
00312
00313 int startPosForNgram;
00314 int startPosForLongestMatchingWithNextWord;
00315 int cellIndexForLongestMatchingWithNextWord;
00316
00317 bool stillMatched = true;
00318 bool atLeastOneMatched = false;
00319
00320 int indexForNgram;
00321
00322 unsigned int totalOccurrences;
00323 unsigned int totalOccurrencesOfHistory;
00324
00325
00326 indexForNgram = sentLen - 1;
00327 if(table[indexForNgram].found){
00328 totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
00329 if(this->applyDiscounting){
00330 freqTable[0] = this->discountFreq(1, totalOccurrences);
00331 }
00332 else{
00333 freqTable[0] = totalOccurrences;
00334 }
00335
00336 freqTable[1] = this->corpusSize;
00337 cellIndexForLongestMatchingWithNextWord = indexForNgram;
00338 startPosForLongestMatchingWithNextWord = sentLen-1;
00339 atLeastOneMatched = true;
00340 }
00341 else{
00342 stillMatched = false;
00343 }
00344
00345 int n=2;
00346 startPosForNgram = sentLen - 2;
00347 while((stillMatched)&&(startPosForNgram>=0)){
00348
00349 indexForNgram = (n-1) * sentLen + startPosForNgram;
00350 int indexForHistory = (n-2) * sentLen + startPosForNgram;
00351
00352 if(table[indexForNgram].found){
00353
00354 totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
00355 totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
00356
00357
00358 if(this->applyDiscounting){
00359 freqTable[2*n-2] = this->discountFreq(n, totalOccurrences);
00360 }
00361 else{
00362 freqTable[2*n-2] = (double)totalOccurrences;
00363 }
00364
00365 freqTable[2*n-1] = (double) totalOccurrencesOfHistory;
00366
00367 if(n<this->maxN){
00368 cellIndexForLongestMatchingWithNextWord = indexForNgram;
00369 startPosForLongestMatchingWithNextWord = startPosForNgram;
00370 }
00371 }
00372 else{
00373 stillMatched = false;
00374 }
00375
00376 startPosForNgram--;
00377 n++;
00378 }
00379
00380 if(atLeastOneMatched){
00381 updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
00382 updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
00383 }
00384 else{
00385 updatedMatchingStart = (TextLenType) -1;
00386 updatedMatchingLen = 0;
00387 }
00388
00389 free(table);
00390
00391 }
00392
00393
00394
00395 double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq)
00396 {
00397 if(n>=this->maxN){
00398 return (double) observedFreq;
00399 }
00400
00401 if(observedFreq>=(this->maxFreqForDiscounting-1)){
00402 return (double) observedFreq;
00403 }
00404
00405
00406 double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1];
00407
00408 if(discountedFreq>0){
00409 return discountedFreq;
00410 }
00411
00412
00413 return (double) observedFreq;
00414 }
00415
00416
00418 LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
00419 {
00420
00421 this->resetLmStates();
00422 this->initialLmState();
00423
00424 return 0;
00425 }
00426
00427 void C_SuffixArrayLanguageModel::initialLmState()
00428 {
00429
00430 S_LMStateInfo sentStartNode;
00431 sentStartNode.locationInCorpus.posInCorpus = 1;
00432 sentStartNode.locationInCorpus.len = 1;
00433 sentStartNode.cachedNextWordExtension.clear();
00434
00435 this->allLMStates.push_back(sentStartNode);
00436 this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0));
00437 }
00438
00439 void C_SuffixArrayLanguageModel::resetLmStates()
00440 {
00441 this->allLMStates.clear();
00442 this->ngramLocation2LmStateId.clear();
00443 }
00444
00445
00454 double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
00455 {
00456 if(lmState>=this->allLMStates.size()){
00457 cerr<<"Invalid LM State: "<<lmState<<endl;
00458 exit(-1);
00459 }
00460
00461
00462 map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache;
00463 iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord );
00464
00465 if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){
00466
00467
00468 S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus;
00469 S_NgramLocationInCorpus updatedNgramLocation;
00470
00471 double logProb = this->logProbFromFreq(
00472 correspondingNgramLocation.posInCorpus,
00473 correspondingNgramLocation.len,
00474 nextWord,
00475 updatedNgramLocation.posInCorpus,
00476 updatedNgramLocation.len);
00477
00478
00479 int updatedLmStateId;
00480 map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId;
00481 iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation);
00482 if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){
00483 S_LMStateInfo newLmStateNode;
00484
00485 newLmStateNode.locationInCorpus = updatedNgramLocation;
00486 newLmStateNode.cachedNextWordExtension.clear();
00487
00488 this->allLMStates.push_back(newLmStateNode);
00489 updatedLmStateId = this->allLMStates.size() -1 ;
00490 this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId));
00491 }
00492 else{
00493 updatedLmStateId = iterNgramLocation2LmStateId->second;
00494 }
00495
00496
00497 S_CachedLmInfo cachedLmInfo;
00498 cachedLmInfo.logProb = logProb;
00499 cachedLmInfo.nextState = updatedLmStateId;
00500
00501 this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo));
00502
00503
00504 nextState = updatedLmStateId;
00505
00506 return logProb;
00507 }
00508
00509 nextState = iterNextWordExtensionCache->second.nextState;
00510
00511 return iterNextWordExtensionCache->second.logProb;
00512 }
00513
00514
00524 double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
00525 {
00526 double logProb = 0;
00527
00528 if (phrase.size() == 0) {
00529 nextState = lmState;
00530 return logProb;
00531 }
00532
00533 for(int i=0;i<phrase.size();i++){
00534 logProb+=this->logProb(lmState, phrase[i], nextState);
00535 lmState = nextState;
00536 }
00537
00538 return logProb;
00539 }
00540
00544 double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
00545 {
00546 LMState dummyNextState;
00547 return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState);
00548 }
00549
00559 double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
00560 {
00561
00562 double logProb;
00563
00564 double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN));
00565 memset(freqTable, 0, 2*this->maxN*sizeof(double));
00566
00567 this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen);
00568
00569 logProb = this->calcLogProb(freqTable);
00570
00571 free(freqTable);
00572
00573 return logProb;
00574
00575 }
00576
00577 double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
00578 {
00579 switch(this->interpolationStrategy){
00580 case 'e':
00581 return this->calcLogProb_equalWeightedInterpolation(freq);
00582 break;
00583 case 'i':
00584 return this->calcLogProb_ibmHeuristicInterpolation(freq);
00585 break;
00586 case 'm':
00587 return this->calcLogProb_maxProbInterpolation(freq);
00588 break;
00589 default:
00590 cerr<<"Unknown interpolation strategy!\n";
00591 exit(0);
00592 }
00593 }
00594
00595 double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
00596 {
00597 double prob = 0.0;
00598
00599
00600 if(freq[0]>0){
00601
00602 int i=0;
00603 bool stillMatched = true;
00604
00605 while(stillMatched && (i<this->maxN)){
00606 if(freq[2*i]>0){
00607 prob+=freq[2*i]/freq[2*i+1];
00608 }
00609 else{
00610 stillMatched = false;
00611 }
00612
00613 i++;
00614 }
00615
00616 return log(prob/(double)this->maxN);
00617 }
00618 else{
00619 return SALM_LOG_PROB_UNK;
00620 }
00621 }
00622
00623 double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
00624 {
00625 double prob = 0.0;
00626 if(freq[0]==0){
00627 return SALM_LOG_PROB_UNK;
00628 }
00629
00630 double remainingWeightSum = 1.0;
00631
00632
00633 int i = this->maxN - 1;
00634
00635 while(freq[2*i]==0){
00636 i--;
00637 }
00638
00639 for(int j=i;j>=0;j--){
00640
00641 double historyFreq = freq[2*j+1];
00642 double logHistoryFreq = log(historyFreq);
00643 if(logHistoryFreq>1){
00644 logHistoryFreq = 1.0;
00645 }
00646
00647 double reliability = 0.1*logHistoryFreq+0.3;
00648 double adjustedWeights = remainingWeightSum * reliability;
00649
00650 prob+=adjustedWeights * freq[2*i]/freq[2*i+1];
00651
00652 remainingWeightSum -= adjustedWeights;
00653 }
00654
00655 return log(prob);
00656 }
00657
00658 double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
00659 {
00660 double maxProb = 0.0;
00661
00662 if(freq[0]>0){
00663
00664 int i=0;
00665 bool stillMatched = true;
00666
00667 while(stillMatched && (i<this->maxN)){
00668 if(freq[2*i]>0){
00669 double prob=freq[2*i]/freq[2*i+1];
00670
00671 if(prob>maxProb){
00672 maxProb = prob;
00673 }
00674 }
00675 else{
00676 stillMatched = false;
00677 }
00678
00679 i++;
00680 }
00681
00682 return log(maxProb);
00683 }
00684 else{
00685 return SALM_LOG_PROB_UNK;
00686 }
00687 }
00688
00689 IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
00690 {
00691 return this->voc->returnId(aWord);
00692 }