_SuffixArraySearchApplicationBase.h

Go to the documentation of this file.
00001 #if !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)
00002 #define __SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_
00003 
00004 #include "_SuffixArrayApplicationBase.h"
// Compact record locating one phrase occurrence in the corpus.
// Element type of the vectors returned by
// C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus().
00010 typedef struct simplePhraseLocationElement
00011 {
              // Sentence identifier within the corpus (name-based reading; numbering base not visible here -- TODO confirm).
00012         TextLenType sentIdInCorpus;
              // Presumably the position of the phrase inside that sentence. Stored as unsigned char,
              // so only values representable in one byte can be held -- confirm the sentence-length limit.
00013         unsigned char posInSentInCorpus;
00014 }S_SimplePhraseLocationElement;
00015 
// Record pairing a span of a source sentence with one location in the corpus.
// Element type of the vectors returned by
// C_SuffixArraySearchApplicationBase::findPhrasesInASentence().
00022 typedef struct phraseLocationElement
00023 {
              // Presumably the first position of the matched phrase in the source (query) sentence -- TODO confirm base/inclusiveness.
00024         unsigned char posStartInSrcSent;
              // Presumably the last position of the matched phrase in the source sentence -- TODO confirm whether inclusive.
00025         unsigned char posEndInSrcSent;
              // Sentence identifier within the corpus (name-based reading).
00026         TextLenType sentIdInCorpus;
              // Presumably the position inside the corpus sentence; one byte wide, like the two source positions above.
00027         unsigned char posInSentInCorpus;
00028 }S_phraseLocationElement;
00029 
// Phrase-location record that also carries a sentence as a vector of C_String.
// Not referenced by any declaration visible in this header; its users are elsewhere.
// Unlike S_phraseLocationElement, the positions here use int / TextLenType rather than unsigned char.
00033 typedef struct phraseLocationWithSrcSentElement
00034 {
              // Presumably start of the phrase in the source sentence -- TODO confirm.
00035         int srcPosStart;
              // Presumably end of the phrase in the source sentence -- TODO confirm whether inclusive.
00036         int srcPosEnd;  
              // Sentence identifier (presumably in the corpus, by analogy with the structs above -- confirm).
00037         TextLenType sentId;
              // Position within the sentence identified by sentId (name-based reading).
00038         TextLenType posInSent;
              // Sentence content as C_String items. `vector` is used unqualified, so the name must
              // already be visible via a header included before this point.
00039         vector<C_String> sentence;
00040 }S_phraseLocationWithSrcSentElement;
00041 
// One cell of the n-gram search table; a pointer to this type is returned by
// C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP().
00045 typedef struct sentSearchTableElement
00046 {
              // Whether the lookup for this cell succeeded (name-based reading).
00047         bool found;
              // Presumably the first suffix-array position of the matching range; likely meaningful
              // only when `found` is true -- TODO confirm in the implementation.
00048         TextLenType startPosInSA;
              // Presumably the last suffix-array position of the matching range -- TODO confirm whether inclusive.
00049         TextLenType endingPosInSA;
00050 }S_sentSearchTableElement;
00051 
00052 
// Search-oriented application class, publicly derived from C_SuffixArrayApplicationBase.
// Only declarations are visible in this listing; descriptions marked "presumably" are read
// from names and signatures and should be confirmed against the implementation file.
// Most queries come in two overloads: one taking raw text (const char *) and one taking
// a vector<IndexType> of vocabulary ids. The vector overloads take a non-const reference;
// whether they modify the argument is not visible here.
00062 class C_SuffixArraySearchApplicationBase : public C_SuffixArrayApplicationBase  
00063 {
00064 public:
              // Presumably loads the data needed for searching from `filename`; `noVoc` / `noOffset`
              // look like switches to skip vocabulary / offset information -- TODO confirm.
00065         void loadData_forSearch(const char * filename, bool noVoc, bool noOffset);
00066 
              // Presumably counts the n-grams of a sentence that are matched in the corpus.
              // "Matced" (sic) is the declared spelling callers depend on.
00067         unsigned int numberOfMatcedNgram(const char * srcSent);
00068         unsigned int numberOfMatcedNgram(vector<IndexType> & sentInVocId);
00069 
              // Presumably the number of exact occurrences of `phrase`, returned as TextLenType.
00070         TextLenType freqOfExactPhraseMatch(const char * phrase);
00071         TextLenType freqOfExactPhraseMatch(vector<IndexType> & phrase);
00072 
              // As above, with two extra non-const reference parameters (`startPosInSA`, `sentLen`)
              // that are presumably outputs -- TODO confirm what they hold when nothing matches.
00073         TextLenType freqOfExactPhraseMatchAndFirstOccurrence(const char * phrase, TextLenType & startPosInSA, int & sentLen);
00074         TextLenType freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen);
00075 
              // Returns, by value, a vector of S_SimplePhraseLocationElement for `phrase`.
00076         vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(const char * phrase);
00077         vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(vector<IndexType> & phrase);
00078 
              // Returns, by value, a vector of S_phraseLocationElement for the given source sentence.
              // NOTE(review): the setParam_* thresholds below presumably filter what is reported -- confirm.
00079         vector<S_phraseLocationElement> findPhrasesInASentence(const char * srcSent);
00080         vector<S_phraseLocationElement> findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs);
00081 
              // Returns nothing; presumably prints n-gram matching frequencies for the sentence
              // (output destination not visible here).
00082         void displayNgramMatchingFreq4Sent(const char *);
00083         void displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId);
00084 
              // Returns a map<int, pair<int, unsigned long> >; `sentLen` is a non-const reference.
              // The meaning of the key and of the two pair members is not visible here -- TODO document from the .cpp.
00085         map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen);
00086         map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int & sentLen);
00087 
              // Returns a raw pointer to S_sentSearchTableElement. Only the text overload has a `sentLen`
              // reference parameter. NOTE(review): ownership of the returned pointer (who frees it, and how)
              // is not visible in this header -- confirm before use.
00088         S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen);
00089         S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId);
00090 
              // Setters whose parameter names match the protected int members declared below;
              // presumably each stores its argument in the member of the same name.
00091         void setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram);
00092         void setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport);
00093         void setParam_longestUnitToReport(int longestUnitToReport);
00094         void setParam_shortestUnitToReport(int shortestUnitToReport);
00095 
              // Presumably returns the protected member `totalSentNum` -- confirm.
00096         TextLenType returnTotalSentNumber();
00097 
              // Conversion helpers between raw text, C_String vectors and vocabulary-id vectors.
              // Tokenisation rules and handling of unknown words are not visible here.
00098         vector<IndexType> convertStringToVocId(const char * sentText);
00099         vector<C_String> convertCharStringToCStringVector(const char * sentText);
00100         vector<IndexType> convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector);
00101 
00102 
              // Default constructor and virtual destructor.
00103         C_SuffixArraySearchApplicationBase();
00104         virtual ~C_SuffixArraySearchApplicationBase();
00105 
00106 protected:
              // Returns bool (presumably "phrase found"); `rangeStart` / `rangeEnd` are non-const
              // references, presumably receiving the matching suffix-array range.
00107         bool locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd);
00108 
              // Presumably narrows [rangeStartPos, rangeEndPos] to the entries whose word at offset `lcp`
              // equals `nextWord`, writing the result range through the two reference parameters -- TODO confirm.
00109         bool searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType & resultStartPos, TextLenType & resultEndPos);
              // Parameters are unnamed in this declaration; returns a char whose encoding
              // (e.g. less/equal/greater) is not visible here -- TODO confirm.
00110         char comparePhraseWithTextWithLCP(IndexType, int, TextLenType);
00111 
              // "SendId" (sic) is the declared spelling. `sentId`, `offset` and, in the second overload,
              // `sentLen` are non-const references, presumably outputs derived from `pos`.
00112         void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset);
00113         void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen);
00114 
00115         
              // Mapping between a (position, n) pair and a flat table index for a sentence of length
              // `sentLen`, and back; the exact layout formula is in the implementation, not here.
00116         unsigned int twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen);
00117         void oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n);
00118 
              // Reporting parameters; names match the setParam_* setters above.
              // Default values are not visible in this header.
00119         int reportMaxOccurrenceOfOneNgram;
00120         int highestFreqThresholdForReport;
00121         int longestUnitToReport;
00122         int shortestUnitToReport;
00123 
              // Presumably the total number of sentences in the loaded corpus.
00124         TextLenType totalSentNum;
00125 };
00126 
00127 #endif // !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)

Generated on Fri Jul 6 23:11:08 2007 for SALM by doxygen 1.5.1