_SuffixArraySearchApplicationBase.cpp

Go to the documentation of this file.
00001 
00006 #include "_SuffixArraySearchApplicationBase.h"
00007 #include <iostream>
00008 
00009 using namespace std;
00010 
00012 // Construction/Destruction
00014 
00015 C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase()
00016 {
00017 
00018     this->reportMaxOccurrenceOfOneNgram = -1;    
00019         this->highestFreqThresholdForReport = -1;
00020         this->shortestUnitToReport = 1;
00021     this->longestUnitToReport = -1; //no constraint
00022 
00023     this->level1Buckets = NULL;
00024         this->noLevel1Bucket = false;   //by default, build level1 bucket
00025 
00026     this->noOffset = false; //by default, load offset    
00027 }
00028 
00029 C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase()
00030 {
00031 
00032 }
00033 
00040 void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport)
00041 {
00042     this->highestFreqThresholdForReport = highestFreqThresholdForReport;
00043 }
00044 
00050 void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport)
00051 {
00052     this->shortestUnitToReport = shortestUnitToReport;
00053 }
00054 
00061 void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport)
00062 {
00063     this->longestUnitToReport = longestUnitToReport;
00064 }
00065 
00072 void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram)
00073 {
00074     this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram;
00075 }
00076 
00077 
00078 
00084 void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset)
00085 {
00086 
00087         this->loadData(filename, noVoc, noOffset, false);       //call the constructor of the super class, load data and build level1Bucket
00088 
00089         if(! this->noOffset){
00090         TextLenType lastSentId;
00091         unsigned char tmpOffset;
00092         this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset);
00093         this->totalSentNum = lastSentId;
00094     }
00095     else{
00096         //we do not have offset information, simply travel to the sentence head
00097         TextLenType pos = this->corpusSize-3;
00098         while(this->corpus_list[pos]<this->sentIdStart){    //still actual words
00099             pos--;
00100         }
00101         //at this position, it should be the <sentId> for the last sentence
00102         this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1;
00103     }
00104     cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n";
00105 
00106 }
00107 
00108 
00113 char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText)
00114 {   
00115 
00116     IndexType vocInText = this->corpus_list[posInText+lcp];
00117 
00118     if(vocInWord == vocInText){
00119         return 0;
00120     }
00121     
00122     if(vocInWord < vocInText){
00123         return 1;
00124     }
00125 
00126     return 2;
00127 }
00128 
00132 vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText)
00133 {
00134         vector<C_String> sentAsStringVector;
00135 
00136         char tmpToken[MAX_TOKEN_LEN];
00137     memset(tmpToken,0,MAX_TOKEN_LEN);
00138 
00139     int pos = 0;
00140 
00141     int inputLen = strlen(sentText);
00142 
00143         for(int posInInput = 0; posInInput<inputLen; posInInput++){
00144         char thisChar = sentText[posInInput];
00145 
00146         if((thisChar==' ')||(thisChar=='\t')){  //delimiters
00147             if(strlen(tmpToken)>0){
00148                 tmpToken[pos] = '\0';               
00149                 sentAsStringVector.push_back(C_String(tmpToken));
00150                 pos=0;
00151                 tmpToken[pos] = '\0';
00152             }
00153         }
00154         else{
00155             tmpToken[pos] = thisChar;
00156             pos++;
00157             if(pos>=MAX_TOKEN_LEN){ //we can handle it
00158                 fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
00159                 exit(0);
00160             }
00161         }
00162     }
00163 
00164     tmpToken[pos] = '\0';
00165     if(strlen(tmpToken)>0){     
00166         sentAsStringVector.push_back(C_String(tmpToken));
00167     }
00168 
00169         return sentAsStringVector;
00170 
00171 }
00172 
00176 vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector)
00177 {
00178         if(this->noVocabulary){
00179         cerr<<"Vocabulary not available!\n";
00180         exit(-1);
00181     }
00182 
00183         vector<IndexType> sentAsVocIdVector;
00184 
00185         for(int i=0;i<sentAsStringVector.size();i++){
00186                 sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i]));        
00187         }
00188         return sentAsVocIdVector;
00189 }
00190 
00191 
00196 vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText)
00197 {
00198         vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText);
00199         return this->convertCStringVectorToVocIdVector(sentAsCStringVector);
00200 }
00201 
00202 
00212 bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos)
00213 {
00214     TextLenType leftPos, rightPos, middlePos;
00215 
00216     //in case the phrase to be searched is beyond the bucket although the first LCP word is the same as this bucket
00217     //e.g. range correspondes to [ab, ad], but we are searching for (aa)
00218     //so first step is to make sure the lcp+next word is still in this range
00219     if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){
00220         //phrase+next word < text corresponding rangeStart, we could not find it inside this range
00221         return false;
00222     }
00223 
00224     if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){
00225         //phrase+next word > text corresponding to rangeEnd
00226         return false;
00227     }
00228     
00229     //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd])
00230 
00231 
00232     //search for left bound ( the pos in text which is the min(text>=w))
00233     //at any time, Left<w<=Right (actually Left<=w<=Right)
00234     leftPos = rangeStartPos;
00235     rightPos = rangeEndPos; 
00236     while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop
00237 
00238         middlePos = (TextLenType)((leftPos + rightPos) / 2);
00239         if(((leftPos + rightPos) % 2) != 0){            
00240             middlePos++; //bias towards right
00241         }
00242 
00243         if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){ 
00244             // phrase <= middlePos in Text, go left
00245             rightPos = middlePos;
00246         }
00247         else{
00248             leftPos = middlePos;    //word > middle, go right
00249         }
00250 
00251     }
00252     //in previous implementation, we can gurantee that Left<w, because we take rangeStartPos-- from original range
00253     //here we can only guarantee that Left<=w, so need to check if Left==w at lcp
00254     if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){
00255         resultStartPos = leftPos;
00256     }
00257     else{
00258         resultStartPos = rightPos;
00259     }
00260 
00261     //search for right bound ( the value which is the max(text<=w))
00262     //at any time, Left<w<=Right (actually Left<=w<=Right)
00263     leftPos = rangeStartPos;
00264     rightPos = rangeEndPos;         
00265     while( rightPos > (leftPos+1)){ //stop when right = left + 1
00266         middlePos = (TextLenType) ((leftPos + rightPos) / 2 );  //bias towards left
00267         
00268         if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right
00269             leftPos = middlePos;
00270         }
00271         else{
00272             rightPos = middlePos;   // ==1, phrase < middlePos
00273         }
00274     }
00275     //in previous implementation, we can gurantee that w<Right, because we take rangeEndPos++ from original range
00276     //here we can only guarantee that w<=Right, so need to check if Right==w at lcp
00277     if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){
00278         resultEndPos = rightPos;
00279     }
00280     else{
00281         resultEndPos = leftPos;
00282     }
00283 
00284     if(resultEndPos>=resultStartPos){
00285         return true;
00286     }
00287 
00288     return false;   //could not find this phrase
00289 }
00290 
00293 S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen)
00294 {
00295         vector<IndexType> sentInVocId = this->convertStringToVocId(sentText);
00296         sentLen = sentInVocId.size();
00297 
00298         return this->constructNgramSearchTable4SentWithLCP(sentInVocId);
00299 }
00300 
00301 
00313 S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId)
00314 {
00315     int sentLen = sentInVocId.size();
00316     S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement));
00317     
00318     //for consistency, initialize all cells
00319     for(int c=0;c<(sentLen*sentLen);c++){
00320         table[c].found = false;
00321         table[c].startPosInSA = 0;
00322         table[c].endingPosInSA = 0;
00323     }
00324     
00325     TextLenType startPos, endPos;
00326 
00327     //initialize word level elements
00328     for(int i=0;i<sentLen;i++){
00329         IndexType vocId = sentInVocId[i];
00330         //cout<<vocId<<" ";
00331         if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId ==0 means this word is OOV <unk>, if vocId>=sentIdStart means for this corpus, we don't know this word
00332             table[i].found = false;
00333         }
00334         else{
00335             table[i].startPosInSA = this->level1Buckets[vocId].first;
00336             table[i].endingPosInSA = this->level1Buckets[vocId].last;
00337 
00338             if(table[i].startPosInSA<=table[i].endingPosInSA){
00339                 table[i].found = true;
00340             }
00341             else{   //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc
00342                 table[i].found = false;
00343             }
00344         }
00345     }
00346     
00347 
00348     //filling in the cells in the table row by row
00349     //basically this means we start by looking for smaller units first
00350     //if they are found, search for longer n-grams
00351     for(int n=1;n<sentLen;n++){ //finding n+1 gram. when n=sentLen-1, we are search for the occurrence of the whole sent
00352         int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension
00353         int levelN_0 = n * sentLen;
00354         for(int j=0;j<= (sentLen - 1 - n); j++){    //possible starting point for n+1 gram
00355             //necessary conditions that this n+1 gram exist are:
00356             //the two sub n-gram all exist in the corpus            
00357             if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){
00358                 IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram                              
00359 
00360                 //n+1 gram has to be in the range of the n-gram in SA
00361                 startPos = table[levelN_1_0 + j].startPosInSA;
00362                 endPos = table[levelN_1_0 + j].endingPosInSA;
00363 
00364                 TextLenType foundPosStart = 0;
00365                 TextLenType foundPosEnd = 0;
00366 
00367                 //the prefix of n words of all suffixes between [startPos, endPos] is the same as the
00368                 //prefix of the n words in the proposed n+1 gram, no need to compare
00369                 //only need to compare the n+1 word, which is "nextWord" here
00370                 if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){                 
00371                     table[levelN_0 + j].found = true;
00372                     table[levelN_0 + j].startPosInSA =  foundPosStart;
00373                     table[levelN_0 + j].endingPosInSA = foundPosEnd;
00374                 }
00375                 else{
00376                     table[levelN_0 + j].found = false;
00377                 }
00378 
00379             }
00380             else{
00381                 table[levelN_0 + j].found = false;
00382             }
00383         }
00384     }
00385     return table;
00386 }
00387 
00388 void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent)
00389 {
00390     vector<IndexType> sentInVocId = this->convertStringToVocId(sent);
00391     this->displayNgramMatchingFreq4Sent(sentInVocId);
00392 }
00393 
00394 void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId)
00395 {
00396     int sentLen = sentInVocId.size();
00397     
00398     int i,j;
00399 
00400     //construct the n-gram search table    
00401     S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId);
00402   
00403     //show sentence
00404     cout<<"\t";
00405     for(i=0;i<sentLen;i++){
00406         cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t";
00407     }
00408     cout<<endl;
00409 
00410     //show frequency of each n-gram
00411     i=0;
00412     bool stillMatch = true;
00413     while(stillMatch &&( i<sentLen)){
00414         cout<<i+1<<"\t";
00415         int startForRow = i*sentLen;
00416         bool anyGood = false;
00417         for(j=0;j<= (sentLen - 1 - i); j++){
00418             if(table[startForRow+j].found){
00419                 //this is for regular case              
00420                 if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){  //more than one occurrence
00421                     cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1;
00422                     anyGood = true;
00423                 }
00424                 else{
00425                     cout<<"0";
00426                 }
00427     
00428             }
00429             else{
00430                 cout<<"0";
00431             }
00432             cout<<"\t";
00433         }
00434 
00435         stillMatch = anyGood;
00436         cout<<endl;
00437         i++;
00438     }
00439     
00440     free(table);
00441 }
00442 
00447 void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset)
00448 {
00449     offset = this->offset_list[pos];
00450     sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
00451 
00452     offset--;   //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
00453 }
00454 
00455 void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen)
00456 {
00457     offset = this->offset_list[pos];
00458     sentLen = this->offset_list[pos-offset];
00459     sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
00460 
00461     offset--;   //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
00462 }
00463 
00464 vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs)
00465 {
00466     if(srcSentAsVocIDs.size()>255){
00467         cerr<<"Sorry, I prefer to handle sentences with less than 255 words. Please cut the sentence short and try it again.\n";
00468         exit(0);
00469     }
00470 
00471     unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size();
00472 
00473     //construct the n-gram search table 
00474     S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs);
00475 
00476     //Now, we know all the n-grams we are looking for
00477     //output the results
00478     vector<S_phraseLocationElement> allFoundNgrams;
00479     S_phraseLocationElement tmpNode;    
00480 
00481     int longestUnitToReportForThisSent = sentLen;
00482     if(this->longestUnitToReport!=-1){
00483         //and if longestUnitToReport is shorter than sentLen
00484         if(this->longestUnitToReport<sentLen){
00485             longestUnitToReportForThisSent = this->longestUnitToReport;
00486         }
00487     }
00488     
00489     for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){
00490         int firstPosInRow = r*sentLen;
00491         for(unsigned char c=0; c<= (sentLen - 1 - r); c++){
00492             if(table[firstPosInRow + c].found){ //at this position the ngram was found
00493                 tmpNode.posStartInSrcSent = c + 1;  //position starts from 1
00494                 tmpNode.posEndInSrcSent = r + c + 1;
00495 
00496                 //now for all ocurrences, find their sentId and realative positions
00497                 TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA;
00498                 TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA;
00499                 
00500                 if( (this->highestFreqThresholdForReport <= 0) ||    //no limit
00501                     ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport ))
00502                 ){  
00503                     // we don't want to retrieve high-freq n-gram which is very time consuming
00504                     //and meaningless for translation, such as 1M occurrences of "of the" in the corpus
00505                                         
00506 
00507                     if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){
00508                         //and for each n-gram, report only a limited amount of occurrences
00509                         endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1;
00510                     }
00511 
00512                     TextLenType sentId;
00513                     unsigned char posInSent;
00514                     for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; iterator++ ){
00515                         this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent);
00516                         tmpNode.sentIdInCorpus = sentId;
00517                         tmpNode.posInSentInCorpus = posInSent;
00518 
00519                         allFoundNgrams.push_back(tmpNode);
00520                     }
00521                 }
00522             }
00523 
00524         }
00525     }
00526     
00527     free(table);
00528 
00529     return allFoundNgrams;  
00530 }
00531 
00532 vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent)
00533 {
00534     //use the vocabulary associated with this corpus to convert words to vocIDs
00535     vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent);
00536 
00537     return this->findPhrasesInASentence(srcSentAsVocIDs);
00538 }
00539 
00540 
00541 bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd)
00542 {
00543     int phraseLen = phrase.size();
00544 
00545     //first check if there are any <unk> in the phrase
00546     for(int i=0;i<phrase.size();i++){
00547         if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){
00548             return false;   //return empty matching result
00549         }
00550     }
00551 
00552     TextLenType currentRangeStart, currentRangeEnd;
00553     TextLenType narrowedRangeStart, narrowedRangeEnd;
00554     IndexType vocId;
00555 
00556     //for word 1
00557     vocId = phrase[0];
00558     currentRangeStart = this->level1Buckets[vocId].first;
00559     currentRangeEnd = this->level1Buckets[vocId].last;
00560 
00561     if(currentRangeStart>currentRangeEnd){
00562         return false;   //even this 1-gram does not exist
00563     }
00564 
00565     int posInPhrase = 1;    
00566     while( posInPhrase<phraseLen ){
00567         vocId = phrase[posInPhrase];
00568         bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd);
00569 
00570         if(! stillExist){
00571             return false;
00572         }
00573         
00574         currentRangeStart = narrowedRangeStart;
00575         currentRangeEnd = narrowedRangeEnd;
00576 
00577         posInPhrase++;
00578     }
00579 
00580     //we find the range of matching phrase, now get the sentId
00581     rangeStart = currentRangeStart;
00582     rangeEnd = currentRangeEnd;
00583 
00584     return true;
00585 }
00586 
00594 vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase)
00595 {
00596     vector<S_SimplePhraseLocationElement> matchingResult;
00597 
00598     TextLenType rangeStart, rangeEnd;
00599 
00600     if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
00601         //we find some match
00602         S_SimplePhraseLocationElement tmpNode;
00603         for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){
00604             this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus);
00605             matchingResult.push_back(tmpNode);
00606         }
00607     }
00608 
00609     return matchingResult;
00610 }
00611 
00612 vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase)
00613 {
00614     //use the vocabulary associated with this corpus to convert words to vocIds
00615     vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
00616 
00617     return this->locateExactPhraseInCorpus(phraseAsVocIDs);
00618 }
00619 
00620 
00621 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase)
00622 {
00623     TextLenType rangeStart, rangeEnd;
00624 
00625     if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
00626         return rangeEnd - rangeStart + 1;
00627     }
00628 
00629     return 0;
00630 }
00631 
00632 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase)
00633 {
00634     //use the vocabulary associated with this corpus to convert words to vocIds
00635     vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
00636 
00637     return this->freqOfExactPhraseMatch(phraseAsVocIDs);
00638 }
00639 
00640 
00641 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen)
00642 {
00643     TextLenType rangeStart, rangeEnd;
00644         
00645         sentLen = phrase.size();
00646 
00647     if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
00648                 startPosInSA = rangeStart;
00649         return rangeEnd - rangeStart + 1;
00650     }
00651 
00652     return 0;
00653 }
00654 
00655 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen)
00656 {
00657     //use the vocabulary associated with this corpus to convert words to vocIds
00658     vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
00659 
00660     return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen);
00661 }
00662 
00663 
00664 TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber()
00665 {
00666     return this->totalSentNum;
00667 }
00668 
00671 void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
00672 {
00673     n = index / sentLen + 1;
00674     posInSrcSent = index % sentLen;
00675 }
00676 
00680 unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
00681 {
00682     unsigned int indexInTable = (n-1)*sentLen + posInSent;
00683 
00684     return indexInTable;
00685 }
00686 
00688 unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent)
00689 {   
00690     vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
00691     return this->numberOfMatcedNgram(sentInVocId);
00692 }
00693 
00695 unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId)
00696 {       
00697     int sentLen = sentInVocId.size();
00698 
00699     S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
00700 
00701     unsigned int totalMatched = 0;
00702 
00703     for(unsigned int i=0;i<(sentLen*sentLen);i++){      
00704         if(table[i].found){
00705             totalMatched++;
00706         }           
00707     }
00708 
00709     free(table);
00710     return totalMatched;
00711 }
00712 
00713 
00714 map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen)
00715 {
00716         vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
00717         return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen);
00718 }
00719 
00720 map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen)
00721 {
00722         sentLen = sentInVocId.size();
00723         map<int, pair<int, unsigned long> > nGramMatched;
00724         map<int, pair<int, unsigned long> >::iterator iterNGramMatched;
00725 
00726         //construct the n-gram search table
00727         S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
00728   
00729         for(int n = 1; n <= sentLen; n++){              
00730                 for(int startPos=0; startPos <= (sentLen - n); startPos++){
00731                         int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen);
00732 
00733                         if(table[indexInTable].found){
00734                                 
00735                                 unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1;
00736                                 iterNGramMatched = nGramMatched.find(n);
00737                                 if(iterNGramMatched==nGramMatched.end()){//has not seen this before
00738                                         nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) ));
00739                                 }
00740                                 else{
00741                                         iterNGramMatched->second.first++;
00742                                         iterNGramMatched->second.second+=freqInTraining;
00743                                 }
00744                         }
00745                 }
00746         }
00747         
00748         free(table);
00749   
00750         return nGramMatched;
00751 }
00752 

Generated on Fri Jul 6 23:11:08 2007 for SALM by  doxygen 1.5.1