00001 #if !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)
00002 #define __SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_
00003
00004 #include "_SuffixArrayApplicationBase.h"
00010 typedef struct simplePhraseLocationElement
00011 {
00012 TextLenType sentIdInCorpus;
00013 unsigned char posInSentInCorpus;
00014 }S_SimplePhraseLocationElement;
00015
00022 typedef struct phraseLocationElement
00023 {
00024 unsigned char posStartInSrcSent;
00025 unsigned char posEndInSrcSent;
00026 TextLenType sentIdInCorpus;
00027 unsigned char posInSentInCorpus;
00028 }S_phraseLocationElement;
00029
00033 typedef struct phraseLocationWithSrcSentElement
00034 {
00035 int srcPosStart;
00036 int srcPosEnd;
00037 TextLenType sentId;
00038 TextLenType posInSent;
00039 vector<C_String> sentence;
00040 }S_phraseLocationWithSrcSentElement;
00041
00045 typedef struct sentSearchTableElement
00046 {
00047 bool found;
00048 TextLenType startPosInSA;
00049 TextLenType endingPosInSA;
00050 }S_sentSearchTableElement;
00051
00052
00062 class C_SuffixArraySearchApplicationBase : public C_SuffixArrayApplicationBase
00063 {
00064 public:
00065 void loadData_forSearch(const char * filename, bool noVoc, bool noOffset);
00066
00067 unsigned int numberOfMatcedNgram(const char * srcSent);
00068 unsigned int numberOfMatcedNgram(vector<IndexType> & sentInVocId);
00069
00070 TextLenType freqOfExactPhraseMatch(const char * phrase);
00071 TextLenType freqOfExactPhraseMatch(vector<IndexType> & phrase);
00072
00073 TextLenType freqOfExactPhraseMatchAndFirstOccurrence(const char * phrase, TextLenType & startPosInSA, int & sentLen);
00074 TextLenType freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen);
00075
00076 vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(const char * phrase);
00077 vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(vector<IndexType> & phrase);
00078
00079 vector<S_phraseLocationElement> findPhrasesInASentence(const char * srcSent);
00080 vector<S_phraseLocationElement> findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs);
00081
00082 void displayNgramMatchingFreq4Sent(const char *);
00083 void displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId);
00084
00085 map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen);
00086 map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int & sentLen);
00087
00088 S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen);
00089 S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId);
00090
00091 void setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram);
00092 void setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport);
00093 void setParam_longestUnitToReport(int longestUnitToReport);
00094 void setParam_shortestUnitToReport(int shortestUnitToReport);
00095
00096 TextLenType returnTotalSentNumber();
00097
00098 vector<IndexType> convertStringToVocId(const char * sentText);
00099 vector<C_String> convertCharStringToCStringVector(const char * sentText);
00100 vector<IndexType> convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector);
00101
00102
00103 C_SuffixArraySearchApplicationBase();
00104 virtual ~C_SuffixArraySearchApplicationBase();
00105
00106 protected:
00107 bool locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd);
00108
00109 bool searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType & resultStartPos, TextLenType & resultEndPos);
00110 char comparePhraseWithTextWithLCP(IndexType, int, TextLenType);
00111
00112 void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset);
00113 void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen);
00114
00115
00116 unsigned int twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen);
00117 void oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n);
00118
00119 int reportMaxOccurrenceOfOneNgram;
00120 int highestFreqThresholdForReport;
00121 int longestUnitToReport;
00122 int shortestUnitToReport;
00123
00124 TextLenType totalSentNum;
00125 };
00126
00127 #endif // !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)