00001
00006 #include "_SuffixArraySearchApplicationBase.h"
00007 #include <iostream>
00008
00009 using namespace std;
00010
00012
00014
00015 C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase()
00016 {
00017
00018 this->reportMaxOccurrenceOfOneNgram = -1;
00019 this->highestFreqThresholdForReport = -1;
00020 this->shortestUnitToReport = 1;
00021 this->longestUnitToReport = -1;
00022
00023 this->level1Buckets = NULL;
00024 this->noLevel1Bucket = false;
00025
00026 this->noOffset = false;
00027 }
00028
00029 C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase()
00030 {
00031
00032 }
00033
00040 void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport)
00041 {
00042 this->highestFreqThresholdForReport = highestFreqThresholdForReport;
00043 }
00044
00050 void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport)
00051 {
00052 this->shortestUnitToReport = shortestUnitToReport;
00053 }
00054
00061 void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport)
00062 {
00063 this->longestUnitToReport = longestUnitToReport;
00064 }
00065
00072 void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram)
00073 {
00074 this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram;
00075 }
00076
00077
00078
00084 void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset)
00085 {
00086
00087 this->loadData(filename, noVoc, noOffset, false);
00088
00089 if(! this->noOffset){
00090 TextLenType lastSentId;
00091 unsigned char tmpOffset;
00092 this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset);
00093 this->totalSentNum = lastSentId;
00094 }
00095 else{
00096
00097 TextLenType pos = this->corpusSize-3;
00098 while(this->corpus_list[pos]<this->sentIdStart){
00099 pos--;
00100 }
00101
00102 this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1;
00103 }
00104 cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n";
00105
00106 }
00107
00108
00113 char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText)
00114 {
00115
00116 IndexType vocInText = this->corpus_list[posInText+lcp];
00117
00118 if(vocInWord == vocInText){
00119 return 0;
00120 }
00121
00122 if(vocInWord < vocInText){
00123 return 1;
00124 }
00125
00126 return 2;
00127 }
00128
00132 vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText)
00133 {
00134 vector<C_String> sentAsStringVector;
00135
00136 char tmpToken[MAX_TOKEN_LEN];
00137 memset(tmpToken,0,MAX_TOKEN_LEN);
00138
00139 int pos = 0;
00140
00141 int inputLen = strlen(sentText);
00142
00143 for(int posInInput = 0; posInInput<inputLen; posInInput++){
00144 char thisChar = sentText[posInInput];
00145
00146 if((thisChar==' ')||(thisChar=='\t')){
00147 if(strlen(tmpToken)>0){
00148 tmpToken[pos] = '\0';
00149 sentAsStringVector.push_back(C_String(tmpToken));
00150 pos=0;
00151 tmpToken[pos] = '\0';
00152 }
00153 }
00154 else{
00155 tmpToken[pos] = thisChar;
00156 pos++;
00157 if(pos>=MAX_TOKEN_LEN){
00158 fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
00159 exit(0);
00160 }
00161 }
00162 }
00163
00164 tmpToken[pos] = '\0';
00165 if(strlen(tmpToken)>0){
00166 sentAsStringVector.push_back(C_String(tmpToken));
00167 }
00168
00169 return sentAsStringVector;
00170
00171 }
00172
00176 vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector)
00177 {
00178 if(this->noVocabulary){
00179 cerr<<"Vocabulary not available!\n";
00180 exit(-1);
00181 }
00182
00183 vector<IndexType> sentAsVocIdVector;
00184
00185 for(int i=0;i<sentAsStringVector.size();i++){
00186 sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i]));
00187 }
00188 return sentAsVocIdVector;
00189 }
00190
00191
00196 vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText)
00197 {
00198 vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText);
00199 return this->convertCStringVectorToVocIdVector(sentAsCStringVector);
00200 }
00201
00202
00212 bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos)
00213 {
00214 TextLenType leftPos, rightPos, middlePos;
00215
00216
00217
00218
00219 if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){
00220
00221 return false;
00222 }
00223
00224 if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){
00225
00226 return false;
00227 }
00228
00229
00230
00231
00232
00233
00234 leftPos = rangeStartPos;
00235 rightPos = rangeEndPos;
00236 while( rightPos > (leftPos+1)){
00237
00238 middlePos = (TextLenType)((leftPos + rightPos) / 2);
00239 if(((leftPos + rightPos) % 2) != 0){
00240 middlePos++;
00241 }
00242
00243 if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){
00244
00245 rightPos = middlePos;
00246 }
00247 else{
00248 leftPos = middlePos;
00249 }
00250
00251 }
00252
00253
00254 if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){
00255 resultStartPos = leftPos;
00256 }
00257 else{
00258 resultStartPos = rightPos;
00259 }
00260
00261
00262
00263 leftPos = rangeStartPos;
00264 rightPos = rangeEndPos;
00265 while( rightPos > (leftPos+1)){
00266 middlePos = (TextLenType) ((leftPos + rightPos) / 2 );
00267
00268 if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){
00269 leftPos = middlePos;
00270 }
00271 else{
00272 rightPos = middlePos;
00273 }
00274 }
00275
00276
00277 if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){
00278 resultEndPos = rightPos;
00279 }
00280 else{
00281 resultEndPos = leftPos;
00282 }
00283
00284 if(resultEndPos>=resultStartPos){
00285 return true;
00286 }
00287
00288 return false;
00289 }
00290
00293 S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen)
00294 {
00295 vector<IndexType> sentInVocId = this->convertStringToVocId(sentText);
00296 sentLen = sentInVocId.size();
00297
00298 return this->constructNgramSearchTable4SentWithLCP(sentInVocId);
00299 }
00300
00301
00313 S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId)
00314 {
00315 int sentLen = sentInVocId.size();
00316 S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement));
00317
00318
00319 for(int c=0;c<(sentLen*sentLen);c++){
00320 table[c].found = false;
00321 table[c].startPosInSA = 0;
00322 table[c].endingPosInSA = 0;
00323 }
00324
00325 TextLenType startPos, endPos;
00326
00327
00328 for(int i=0;i<sentLen;i++){
00329 IndexType vocId = sentInVocId[i];
00330
00331 if((vocId==0)||(vocId>=this->sentIdStart)){
00332 table[i].found = false;
00333 }
00334 else{
00335 table[i].startPosInSA = this->level1Buckets[vocId].first;
00336 table[i].endingPosInSA = this->level1Buckets[vocId].last;
00337
00338 if(table[i].startPosInSA<=table[i].endingPosInSA){
00339 table[i].found = true;
00340 }
00341 else{
00342 table[i].found = false;
00343 }
00344 }
00345 }
00346
00347
00348
00349
00350
00351 for(int n=1;n<sentLen;n++){
00352 int levelN_1_0 = (n - 1) * sentLen;
00353 int levelN_0 = n * sentLen;
00354 for(int j=0;j<= (sentLen - 1 - n); j++){
00355
00356
00357 if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){
00358 IndexType nextWord = sentInVocId[j+n];
00359
00360
00361 startPos = table[levelN_1_0 + j].startPosInSA;
00362 endPos = table[levelN_1_0 + j].endingPosInSA;
00363
00364 TextLenType foundPosStart = 0;
00365 TextLenType foundPosEnd = 0;
00366
00367
00368
00369
00370 if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){
00371 table[levelN_0 + j].found = true;
00372 table[levelN_0 + j].startPosInSA = foundPosStart;
00373 table[levelN_0 + j].endingPosInSA = foundPosEnd;
00374 }
00375 else{
00376 table[levelN_0 + j].found = false;
00377 }
00378
00379 }
00380 else{
00381 table[levelN_0 + j].found = false;
00382 }
00383 }
00384 }
00385 return table;
00386 }
00387
00388 void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent)
00389 {
00390 vector<IndexType> sentInVocId = this->convertStringToVocId(sent);
00391 this->displayNgramMatchingFreq4Sent(sentInVocId);
00392 }
00393
00394 void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId)
00395 {
00396 int sentLen = sentInVocId.size();
00397
00398 int i,j;
00399
00400
00401 S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId);
00402
00403
00404 cout<<"\t";
00405 for(i=0;i<sentLen;i++){
00406 cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t";
00407 }
00408 cout<<endl;
00409
00410
00411 i=0;
00412 bool stillMatch = true;
00413 while(stillMatch &&( i<sentLen)){
00414 cout<<i+1<<"\t";
00415 int startForRow = i*sentLen;
00416 bool anyGood = false;
00417 for(j=0;j<= (sentLen - 1 - i); j++){
00418 if(table[startForRow+j].found){
00419
00420 if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){
00421 cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1;
00422 anyGood = true;
00423 }
00424 else{
00425 cout<<"0";
00426 }
00427
00428 }
00429 else{
00430 cout<<"0";
00431 }
00432 cout<<"\t";
00433 }
00434
00435 stillMatch = anyGood;
00436 cout<<endl;
00437 i++;
00438 }
00439
00440 free(table);
00441 }
00442
00447 void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset)
00448 {
00449 offset = this->offset_list[pos];
00450 sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
00451
00452 offset--;
00453 }
00454
00455 void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen)
00456 {
00457 offset = this->offset_list[pos];
00458 sentLen = this->offset_list[pos-offset];
00459 sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
00460
00461 offset--;
00462 }
00463
00464 vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs)
00465 {
00466 if(srcSentAsVocIDs.size()>255){
00467 cerr<<"Sorry, I prefer to handle sentences with less than 255 words. Please cut the sentence short and try it again.\n";
00468 exit(0);
00469 }
00470
00471 unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size();
00472
00473
00474 S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs);
00475
00476
00477
00478 vector<S_phraseLocationElement> allFoundNgrams;
00479 S_phraseLocationElement tmpNode;
00480
00481 int longestUnitToReportForThisSent = sentLen;
00482 if(this->longestUnitToReport!=-1){
00483
00484 if(this->longestUnitToReport<sentLen){
00485 longestUnitToReportForThisSent = this->longestUnitToReport;
00486 }
00487 }
00488
00489 for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){
00490 int firstPosInRow = r*sentLen;
00491 for(unsigned char c=0; c<= (sentLen - 1 - r); c++){
00492 if(table[firstPosInRow + c].found){
00493 tmpNode.posStartInSrcSent = c + 1;
00494 tmpNode.posEndInSrcSent = r + c + 1;
00495
00496
00497 TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA;
00498 TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA;
00499
00500 if( (this->highestFreqThresholdForReport <= 0) ||
00501 ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport ))
00502 ){
00503
00504
00505
00506
00507 if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){
00508
00509 endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1;
00510 }
00511
00512 TextLenType sentId;
00513 unsigned char posInSent;
00514 for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; iterator++ ){
00515 this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent);
00516 tmpNode.sentIdInCorpus = sentId;
00517 tmpNode.posInSentInCorpus = posInSent;
00518
00519 allFoundNgrams.push_back(tmpNode);
00520 }
00521 }
00522 }
00523
00524 }
00525 }
00526
00527 free(table);
00528
00529 return allFoundNgrams;
00530 }
00531
00532 vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent)
00533 {
00534
00535 vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent);
00536
00537 return this->findPhrasesInASentence(srcSentAsVocIDs);
00538 }
00539
00540
00541 bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd)
00542 {
00543 int phraseLen = phrase.size();
00544
00545
00546 for(int i=0;i<phrase.size();i++){
00547 if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){
00548 return false;
00549 }
00550 }
00551
00552 TextLenType currentRangeStart, currentRangeEnd;
00553 TextLenType narrowedRangeStart, narrowedRangeEnd;
00554 IndexType vocId;
00555
00556
00557 vocId = phrase[0];
00558 currentRangeStart = this->level1Buckets[vocId].first;
00559 currentRangeEnd = this->level1Buckets[vocId].last;
00560
00561 if(currentRangeStart>currentRangeEnd){
00562 return false;
00563 }
00564
00565 int posInPhrase = 1;
00566 while( posInPhrase<phraseLen ){
00567 vocId = phrase[posInPhrase];
00568 bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd);
00569
00570 if(! stillExist){
00571 return false;
00572 }
00573
00574 currentRangeStart = narrowedRangeStart;
00575 currentRangeEnd = narrowedRangeEnd;
00576
00577 posInPhrase++;
00578 }
00579
00580
00581 rangeStart = currentRangeStart;
00582 rangeEnd = currentRangeEnd;
00583
00584 return true;
00585 }
00586
00594 vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase)
00595 {
00596 vector<S_SimplePhraseLocationElement> matchingResult;
00597
00598 TextLenType rangeStart, rangeEnd;
00599
00600 if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
00601
00602 S_SimplePhraseLocationElement tmpNode;
00603 for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){
00604 this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus);
00605 matchingResult.push_back(tmpNode);
00606 }
00607 }
00608
00609 return matchingResult;
00610 }
00611
00612 vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase)
00613 {
00614
00615 vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
00616
00617 return this->locateExactPhraseInCorpus(phraseAsVocIDs);
00618 }
00619
00620
00621 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase)
00622 {
00623 TextLenType rangeStart, rangeEnd;
00624
00625 if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
00626 return rangeEnd - rangeStart + 1;
00627 }
00628
00629 return 0;
00630 }
00631
00632 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase)
00633 {
00634
00635 vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
00636
00637 return this->freqOfExactPhraseMatch(phraseAsVocIDs);
00638 }
00639
00640
00641 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen)
00642 {
00643 TextLenType rangeStart, rangeEnd;
00644
00645 sentLen = phrase.size();
00646
00647 if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
00648 startPosInSA = rangeStart;
00649 return rangeEnd - rangeStart + 1;
00650 }
00651
00652 return 0;
00653 }
00654
00655 TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen)
00656 {
00657
00658 vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
00659
00660 return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen);
00661 }
00662
00663
00664 TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber()
00665 {
00666 return this->totalSentNum;
00667 }
00668
00671 void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
00672 {
00673 n = index / sentLen + 1;
00674 posInSrcSent = index % sentLen;
00675 }
00676
00680 unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
00681 {
00682 unsigned int indexInTable = (n-1)*sentLen + posInSent;
00683
00684 return indexInTable;
00685 }
00686
00688 unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent)
00689 {
00690 vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
00691 return this->numberOfMatcedNgram(sentInVocId);
00692 }
00693
00695 unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId)
00696 {
00697 int sentLen = sentInVocId.size();
00698
00699 S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
00700
00701 unsigned int totalMatched = 0;
00702
00703 for(unsigned int i=0;i<(sentLen*sentLen);i++){
00704 if(table[i].found){
00705 totalMatched++;
00706 }
00707 }
00708
00709 free(table);
00710 return totalMatched;
00711 }
00712
00713
00714 map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen)
00715 {
00716 vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
00717 return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen);
00718 }
00719
00720 map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen)
00721 {
00722 sentLen = sentInVocId.size();
00723 map<int, pair<int, unsigned long> > nGramMatched;
00724 map<int, pair<int, unsigned long> >::iterator iterNGramMatched;
00725
00726
00727 S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
00728
00729 for(int n = 1; n <= sentLen; n++){
00730 for(int startPos=0; startPos <= (sentLen - n); startPos++){
00731 int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen);
00732
00733 if(table[indexInTable].found){
00734
00735 unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1;
00736 iterNGramMatched = nGramMatched.find(n);
00737 if(iterNGramMatched==nGramMatched.end()){
00738 nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) ));
00739 }
00740 else{
00741 iterNGramMatched->second.first++;
00742 iterNGramMatched->second.second+=freqInTraining;
00743 }
00744 }
00745 }
00746 }
00747
00748 free(table);
00749
00750 return nGramMatched;
00751 }
00752