_SuffixArrayLanguageModel.h

Go to the documentation of this file.
00001 // Revision $Rev: 3824 $
00002 // Last Modified $LastChangedDate: 2007-07-07 00:51:51 -0400 (Sat, 07 Jul 2007) $
00003 
00004 #if ! defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__)
00005 #define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__
00006 
00007 
00008 #include "_SuffixArraySearchApplicationBase.h"
00009 #include "salm_shared.h"
00010 
// Handle type for a language-model state; it is the state argument/result type of
// the C_SuffixArrayLanguageModel query methods declared in this header.
// NOTE(review): presumably an index into C_SuffixArrayLanguageModel::allLMStates — confirm in the .cpp.
00014 typedef unsigned int LMState;
00015 
00016 
// Cached result of extending one LM state by one word. This is the mapped type of
// S_LMStateInfo::cachedNextWordExtension, which is keyed by the next word's IndexType.
00020 typedef struct s_cachedLmInfo{
00021         int nextState; // presumably the id of the state reached by the extension — confirm. NOTE(review): declared int although LMState is unsigned int.
00022         double logProb; // presumably the log probability computed for that extension — confirm against the .cpp
00023 }S_CachedLmInfo;
00024 
// Identifies an n-gram by a (position, length) pair. Used as the key of
// C_SuffixArrayLanguageModel::ngramLocation2LmStateId (ordered by lt_ngramLocationInCorpus)
// and stored inside every S_LMStateInfo.
00028 typedef struct s_NgramLocationInCorpus{
00029         TextLenType posInCorpus; // presumably the corpus position where the n-gram occurrence starts — confirm
00030         unsigned char len; // n-gram length; the unsigned char type caps it at UCHAR_MAX
00031 }S_NgramLocationInCorpus;
00032 
// Per-state record; this is the element type of C_SuffixArrayLanguageModel::allLMStates.
// Pairs the n-gram location that defines the state with a per-next-word cache.
00036 typedef struct s_lmStateInfo{
00037         S_NgramLocationInCorpus locationInCorpus; // (position, length) of the n-gram this state stands for
00038         map<IndexType, S_CachedLmInfo> cachedNextWordExtension; //cached information of this LMState extended by the next word; key is the next word's IndexType
00039 }S_LMStateInfo;
00040 
00044 struct lt_ngramLocationInCorpus
00045 {
00046   bool operator()(S_NgramLocationInCorpus a, S_NgramLocationInCorpus b) const{
00047                 if(a.posInCorpus<b.posInCorpus){
00048                         return true;
00049                 }
00050 
00051                 if(a.posInCorpus>b.posInCorpus){
00052                         return false;
00053                 }
00054 
00055                 if(a.len<b.len){
00056                         return true;
00057                 }
00058 
00059                 return false;   
00060         }
00061 };
00062 
00063 
// Language model declared on top of C_SuffixArraySearchApplicationBase.
// The public interface exposes state-based queries: obtain a start state, then call
// logProb()/logProbEnd() with an LMState. Only declarations are visible in this header;
// behavioural notes below that cannot be read off the signatures are marked as assumptions.
// NOTE(review): the class declares a destructor and holds raw pointers (discountingMap,
// nGramScanningList) but declares no copy constructor / copy assignment. If those pointers
// are owned, copying an instance would be unsafe — confirm ownership in the .cpp.
00072 class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase
00073 {
00074 
00075 public:
              // Maps a word to an IndexType. Presumably a vocabulary-id lookup — confirm in the .cpp.
              // aWord is taken by value.
00076         IndexType returnVocId(C_String aWord);  
00077 
              // Returns an LMState; by its name, the state to start scoring a sentence from — confirm.
00079         LMState beginOfSentenceState();
00080         
              // Single-word query: takes the current state and the next word's id, returns a double
              // and reports a state through the nextState out-parameter.
              // Presumably the return value is log P(nextWord | lmState) — confirm base and semantics in the .cpp.
00082         double logProb(LMState lmState, IndexType nextWord, LMState & nextState);
00083         
              // Phrase overload of logProb(): same shape, but takes a sequence of word ids.
              // nextPhrase is passed by value, so the vector is copied on every call.
              // Presumably returns the accumulated log probability of the phrase — confirm.
00085         double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState);     
00086         
              // Returns a double for the given state with no next-word argument.
              // Presumably the log probability of ending the sentence in lmState — confirm.
00088         double logProbEnd(LMState lmState);     
00089 
              // Setter for the interpolation strategy, passed as a char code.
              // The valid codes are not visible in this header; presumably they select among the
              // calcLogProb_* variants declared below — confirm.
00091         void setParam_interpolationStrategy(char interpolationStrategy);
00092 
00093 
              // Constructs from a configuration file name.
              // NOTE(review): not declared explicit, so a const char* converts implicitly to this class.
00094         C_SuffixArrayLanguageModel(const char * cfgFileName);
              // Default constructor; what it initialises is not visible here.
00095         C_SuffixArrayLanguageModel();
00096         ~C_SuffixArrayLanguageModel();
00097 
00098 
00099 private:
00100 
              // Takes a current match (start, length) and a next word, a caller-supplied double array
              // (freqTable), and reports an updated match through the two reference out-parameters.
              // The required size of freqTable is not visible here — TODO confirm (presumably tied to maxN).
00101         void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
00102 
00103         //Log prob calculation
              // Same (match, next word) -> (double, updated match) shape as the function above,
              // without a frequency-table parameter.
00104         double logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
              // Each calcLogProb* function maps a double array (freq) to a double.
              // Presumably calcLogProb dispatches on interpolationStrategy to one of the three variants — confirm.
00105         double calcLogProb(double *freq);
00106         double calcLogProb_equalWeightedInterpolation(double *freq);
00107         double calcLogProb_ibmHeuristicInterpolation(double *freq);
00108         double calcLogProb_maxProbInterpolation(double * freq);
00109 
00110         char interpolationStrategy; // strategy code; same type as the setParam_interpolationStrategy argument
00111         int maxN; // presumably the highest n-gram order considered — confirm
00112         IndexType vocIdForSentStart; // ids of special tokens, going by the names (sentence start / sentence end / corpus end) — confirm
00113         IndexType vocIdForSentEnd;
00114         IndexType vocIdForCorpusEnd;
00115 
              // Discounting support. Allocation, size and ownership of discountingMap are not visible here.
00117         void constructDiscountingMap();
00118         double *discountingMap;
              // Takes an n and an observed frequency and returns a double; presumably the discounted frequency — confirm.
00119         double discountFreq(int n, unsigned int observedFreq);
00120         bool applyDiscounting;
00121         int maxFreqForDiscounting;
              // Raw pointer to S_nGramScanningInfoElement (type declared elsewhere); ownership not visible here.
00122         S_nGramScanningInfoElement * nGramScanningList; 
00123         
00124 
              // LM-state bookkeeping helpers; their effect on the two containers below is not visible in this header.
00126         void resetLmStates();
00127         void initialLmState();  
00128         
00129         //caching lm prob for each sentence     
              // Per-state records. NOTE(review): presumably indexed by LMState — confirm.
00130         vector<S_LMStateInfo> allLMStates;
              // Maps an n-gram location to an int state id, ordered by lt_ngramLocationInCorpus.
00131         map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus> ngramLocation2LmStateId;
00132 
00133 
00134 
00135 };
00136 
00137 #endif

Generated on Fri Jul 6 23:11:07 2007 for SALM by  doxygen 1.5.1