00001
00002
00003
00004 #if ! defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__)
00005 #define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__
00006
00007
00008 #include "_SuffixArraySearchApplicationBase.h"
00009 #include "salm_shared.h"
00010
/// Identifier of a language-model state. This is the type of the lmState and
/// nextState arguments of C_SuffixArrayLanguageModel::logProb().
/// NOTE(review): presumably an index into the model's table of states
/// (allLMStates) -- confirm against the implementation.
typedef unsigned int LMState;
00015
00016
/// Value type of the per-state cache S_LMStateInfo::cachedNextWordExtension,
/// where entries are keyed by a word id (IndexType).
struct s_cachedLmInfo
{
    /// NOTE(review): presumably the id of the LM state reached after the
    /// extension -- confirm. Stored as int, whereas LMState is unsigned int.
    int nextState;

    /// NOTE(review): presumably the log probability of the cached extension
    /// -- confirm against the code that fills the cache.
    double logProb;
};
typedef s_cachedLmInfo S_CachedLmInfo;
00024
00028 typedef struct s_NgramLocationInCorpus{
00029 TextLenType posInCorpus;
00030 unsigned char len;
00031 }S_NgramLocationInCorpus;
00032
00036 typedef struct s_lmStateInfo{
00037 S_NgramLocationInCorpus locationInCorpus;
00038 map<IndexType, S_CachedLmInfo> cachedNextWordExtension;
00039 }S_LMStateInfo;
00040
00044 struct lt_ngramLocationInCorpus
00045 {
00046 bool operator()(S_NgramLocationInCorpus a, S_NgramLocationInCorpus b) const{
00047 if(a.posInCorpus<b.posInCorpus){
00048 return true;
00049 }
00050
00051 if(a.posInCorpus>b.posInCorpus){
00052 return false;
00053 }
00054
00055 if(a.len<b.len){
00056 return true;
00057 }
00058
00059 return false;
00060 }
00061 };
00062
00063
00072 class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase
00073 {
00074
00075 public:
00076 IndexType returnVocId(C_String aWord);
00077
00079 LMState beginOfSentenceState();
00080
00082 double logProb(LMState lmState, IndexType nextWord, LMState & nextState);
00083
00085 double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState);
00086
00088 double logProbEnd(LMState lmState);
00089
00091 void setParam_interpolationStrategy(char interpolationStrategy);
00092
00093
00094 C_SuffixArrayLanguageModel(const char * cfgFileName);
00095 C_SuffixArrayLanguageModel();
00096 ~C_SuffixArrayLanguageModel();
00097
00098
00099 private:
00100
00101 void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
00102
00103
00104 double logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
00105 double calcLogProb(double *freq);
00106 double calcLogProb_equalWeightedInterpolation(double *freq);
00107 double calcLogProb_ibmHeuristicInterpolation(double *freq);
00108 double calcLogProb_maxProbInterpolation(double * freq);
00109
00110 char interpolationStrategy;
00111 int maxN;
00112 IndexType vocIdForSentStart;
00113 IndexType vocIdForSentEnd;
00114 IndexType vocIdForCorpusEnd;
00115
00117 void constructDiscountingMap();
00118 double *discountingMap;
00119 double discountFreq(int n, unsigned int observedFreq);
00120 bool applyDiscounting;
00121 int maxFreqForDiscounting;
00122 S_nGramScanningInfoElement * nGramScanningList;
00123
00124
00126 void resetLmStates();
00127 void initialLmState();
00128
00129
00130 vector<S_LMStateInfo> allLMStates;
00131 map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus> ngramLocation2LmStateId;
00132
00133
00134
00135 };
00136
00137 #endif