// ClusterSum wrapper: delegate to the model
return m_pModel->ClusterSum(m_seq, nPos, nOrder);
// Model::Reset (fragment)
m_maxSampleLen = (int)(1.02 * maxlen);

m_pi.Reset(m_maxlen + 1);
m_logz.Reset(m_maxlen + 1);
m_zeta.Reset(m_maxlen + 1);

m_matLenJump.Reset(m_maxSampleLen + 1, m_maxSampleLen + 1);
for (int i = 1; i < m_matLenJump.GetRow(); i++) {
    for (int j = max(1, i - 1); j <= min(m_matLenJump.GetCol() - 1, i + 1); j++) {
        m_matLenJump[i][j] = 1;
    }
    m_matLenJump[i][i] = 0; // forbid proposing the current length again
    LineNormalize(m_matLenJump[i].GetBuf(), m_matLenJump.GetCol());
}
// SetParam: copy the external parameter vector into the model
memcpy(m_value.GetBuf(), pValue, sizeof(pValue[0]) * GetParamNum());

// GetParam: copy the model parameters out
memcpy(pValue, m_value.GetBuf(), sizeof(pValue[0]) * GetParamNum());
// Model::GetLogProb (fragment)
m_pFeat->Find(afeat, seq);
for (int i = 0; i < afeat.GetNum(); i++) {
    logSum += m_value[afeat[i]];
}

int nLen = min(m_maxlen, seq.GetLen());
logSum = logSum - m_logz[nLen] + Prob2LogP(m_pi[nLen]);
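Read together, GetLogProb computes the trans-dimensional model probability

    log p(l, x) = sum_i m_value[afeat[i]] + Prob2LogP(m_pi[l]) - m_logz[l]

with the length clamped to l = min(m_maxlen, seq.GetLen()) when bNorm is true.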
// Model::LoadFromCorpus (fragment)
m_pFeat = new Feat(nOrder, m_pVocab->GetClassNum() > 0);
m_pFeat->Reset(pfeatstyle);
m_pFeat->LoadFeatFromCorpus(pcorpus, m_pVocab);
m_value.Reset(m_pFeat->GetNum());
// Model::FeatCount (fragment)
m_pFeat->Find(afeat, seq);
for (int i = 0; i < afeat.GetNum(); i++) {
    pCount[afeat[i]] += dadd;
}
// Model::ReadT (fragment)
File fout(pfilename, "rt");
lout << "[Model]: Read(txt) from " << pfilename << endl;

fout.Scanf("m_vocabsize=%d\n", &nVocabSize);
fout.Scanf("m_maxlen=%d\n", &m_maxlen);

Reset(m_pVocab, m_maxlen);
if (m_pVocab->GetSize() != nVocabSize) {
    lout_error("[Model] ReadT: the input nVocabSize(" << nVocabSize
        << ") != m_pVocab->GetSize(" << m_pVocab->GetSize() << ")");
}

fout.Scanf("m_pi=[ ");
for (int i = 1; i <= m_maxlen; i++) {
    fout.Scanf("%lf ", &dValue);
}
fout.Scanf("m_logz=[ ");
for (int i = 1; i <= m_maxlen; i++) {
    fout.Scanf("%lf ", &dValue);
}
fout.Scanf("m_zeta=[ ");
for (int i = 1; i <= m_maxlen; i++) {
    fout.Scanf("%lf ", &dValue);
}

fout.Scanf("featnum=%d\n", &nValue);
m_value.Reset(nValue);
m_pFeat->ReadT(fout, m_value.GetBuf());
// Model::WriteT (fragment)
File fout(pfilename, "wt");
lout << "[Model] Write(txt) to " << pfilename << endl;

fout.Print("m_vocabsize=%d\n", m_pVocab->GetSize());
fout.Print("m_maxlen=%d\n", m_maxlen);

fout.Print("m_pi=[ ");
for (int i = 1; i <= m_maxlen; i++) {
    fout.Print("%f ", m_pi[i]);
}
fout.Print("m_logz=[ ");
for (int i = 1; i <= m_maxlen; i++) {
    fout.Print("%f ", m_logz[i]);
}
fout.Print("m_zeta=[ ");
for (int i = 1; i <= m_maxlen; i++) {
    fout.Print("%f ", m_zeta[i]);
}

fout.Print("featnum=%d\n", m_pFeat->GetNum());
m_pFeat->WriteT(fout, m_value.GetBuf());
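From the matching Print/Scanf formats, the text model file evidently has this layout (the numbers are purely illustrative):

    m_vocabsize=10000
    m_maxlen=32
    m_pi=[ 0.02 0.05 ... ]
    m_logz=[ 3.1 6.4 ... ]
    m_zeta=[ 0.0 3.3 ... ]
    featnum=204800

followed by the feature table written by Feat::WriteT.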
// Model::ClusterSum (fragment)
int nWordFeatOrder = min(nOrder, GetMaxOrder());
for (int n = 1; n <= nWordFeatOrder; n++) {
    m_pFeat->Find(afeat, seq, nPos, n);
}

if (nPos == nLen - nOrder) { // the last cluster: include the features after nPos
    for (int i = nPos + 1; i < nLen; i++) {
        nWordFeatOrder = min(nLen - i, GetMaxOrder());
        for (int n = 1; n <= nWordFeatOrder; n++) {
            m_pFeat->Find(afeat, seq, i, n);
        }
    }
}

for (int i = 0; i < afeat.GetNum(); i++)
    LogSum += m_value[afeat[i]];
// Model::ExactNormalize(int nLen) (fragment)
int nMaxOrder = GetMaxOrder();

if (nLen <= nMaxOrder) { // short sequences: enumerate all sequences exactly
    SeqIter.AddAllLine(0, m_pVocab->GetSize() - 1);
    while (SeqIter.Next()) {
        double d = GetLogProb(seq, false);
        // ...
    }
}
// longer sequences: run the forward-backward algorithm
m_AlgNode.ForwardBackward(nLen, nMaxOrder, m_pVocab->GetSize());
logZ = m_AlgNode.GetLogSummation();
// Model::ExactNormalize() (fragment)
for (int len = 1; len <= m_maxlen; len++) {
    m_zeta[len] = m_logz[len] - m_logz[1];
}
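m_zeta therefore stores the log normalization constants shifted so that length 1 is the reference point, zeta_l = log Z_l - log Z_1 (hence zeta_1 = 0).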
// Model::GetNodeExp(int nLen, double *pExp) (fragment)
memset(pExp, 0, sizeof(pExp[0]) * GetParamNum());

int nMaxOrder = GetMaxOrder();

if (nLen <= nMaxOrder) { // short sequences: enumerate exactly
    SeqIter.AddAllLine(0, m_pVocab->GetSize() - 1);
    while (SeqIter.Next()) {
        m_pFeat->Find(afeat, seq);
        for (int i = 0; i < afeat.GetNum(); i++) {
            pExp[afeat[i]] += prob;
        }
    }
}

// otherwise, sum over clusters of nMaxOrder positions
int nClusterNum = nLen - nMaxOrder + 1;
for (int pos = 0; pos < nClusterNum; pos++) {
    SeqIter.AddAllLine(0, m_pVocab->GetSize() - 1);
    while (SeqIter.Next()) {
        for (int n = 1; n <= nMaxOrder; n++)
            m_pFeat->Find(afeat, seq, pos, n);
        for (int i = 0; i < afeat.GetNum(); i++) {
            pExp[afeat[i]] += prob;
        }

        if (pos == nClusterNum - 1) { // the last cluster: collect the remaining features
            for (int ii = 1; ii < nMaxOrder; ii++) {
                for (int n = 1; n <= nMaxOrder - ii; n++) {
                    m_pFeat->Find(afeat, seq, pos + ii, n);
                }
            }
            for (int i = 0; i < afeat.GetNum(); i++) {
                pExp[afeat[i]] += prob;
            }
        }
    }
}
// Model::GetNodeExp (length-weighted overload, fragment)
if (pLenProb == NULL)
    pLenProb = m_pi.GetBuf();

for (int len = 1; len <= m_maxlen; len++) {
    int nMaxOrder = GetMaxOrder();
    m_AlgNode.ForwardBackward(len, nMaxOrder, m_pVocab->GetSize());
    GetNodeExp(len, expTemp.GetBuf());
    for (int i = 0; i < exp.GetSize(); i++) {
        exp[i] += pLenProb[len] * expTemp[i];
    }
}
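This overload accumulates the per-length expectations into a single vector, weighted by the length distribution (m_pi when pLenProb is NULL):

    E[f] = sum_l pLenProb[l] * E_{p_l}[f]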
// Model::LocalJump (fragment)
int nOldLen = seq.GetLen();

LogP j1 = ProposeLength(nOldLen, nNewLen, true);
LogP j2 = ProposeLength(nNewLen, nOldLen, false);

if (nNewLen == nOldLen)
    return;

if (nNewLen == nOldLen + 1) { // jump to a longer sequence
    LogP logpold = GetLogProb(seq);
    LogP R = ProposeC0(seq.x[class_layer][nNewLen - 1], seq, nNewLen - 1, true);
    LogP G = SampleX(seq, nNewLen - 1);
    LogP logpnew = GetLogProb(seq);
    logpAcc = (j2 - j1) + logpnew - (logpold + R + G);
}
else if (nNewLen == nOldLen - 1) { // jump to a shorter sequence
    LogP logpold = GetLogProb(seq);
    LogP R = ProposeC0(seq.x[class_layer][nOldLen - 1], seq, nOldLen - 1, false);
    LogP G = SampleX(seq, nOldLen - 1, false);
    LogP logpnew = GetLogProb(seq);
    logpAcc = (j2 - j1) + logpnew + R + G - logpold;
}
else if (nNewLen != nOldLen) {
    lout_error("[Model] Sample: nNewLen(" << nNewLen << ") and nOldLen(" << nOldLen << ")");
}

m_nLenJumpAccTimes++;   // incremented only when the jump is accepted
m_nLenJumpTotalTime++;
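The acceptance log-probability above is the standard Metropolis-Hastings ratio for a trans-dimensional jump: for the growing move,

    logpAcc = [log q(old|new) - log q(new|old)] + log p(new) - [log p(old) + R + G]

where R and G are the log proposal probabilities of the newly drawn class and word; the shrinking move mirrors it, crediting R and G to the reverse proposal.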
// Model::MarkovMove (fragment)
for (int nPos = 0; nPos < seq.GetLen(); nPos++) {
    // Gibbs-sample the class and the word at each position (SampleC, SampleX)
}
// Model::ProposeLength (fragment)
nNew = LineSampling(m_matLenJump[nOld].GetBuf(), m_matLenJump[nOld].GetSize());
return Prob2LogP(m_matLenJump[nOld][nNew]);
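LineSampling itself is not part of this listing. A minimal sketch, assuming it draws an index i with probability pdProbs[i] from an already normalized line (the name LineSamplingSketch and the rand()-based draw are illustrative, not the library's own):

    #include <cstdlib>

    typedef double Prob;

    // Hypothetical sketch: inverse-CDF sampling over a normalized probability line.
    int LineSamplingSketch(const Prob *pdProbs, int nNum)
    {
        double d = rand() / (double)RAND_MAX; // uniform draw in [0, 1]
        double dSum = 0;
        for (int i = 0; i < nNum; i++) {
            dSum += pdProbs[i];
            if (d <= dSum)
                return i;  // first index whose cumulative mass covers d
        }
        return nNum - 1;   // guard against floating-point round-off
    }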
// Model::ProposeC0 / ProposeCProbs (fragments)
if (m_pVocab->GetClassNum() == 0) {
    // no class layer: nothing to propose
}

Vec<LogP> vlogps(m_pVocab->GetClassNum());
ProposeCProbs(vlogps, seq, nPos);

// ProposeCProbs: evaluate the reduced model for each candidate class
for (int cid = 0; cid < m_pVocab->GetClassNum(); cid++) {
    logps[cid] = GetReducedModelForC(seq, nPos);
}
// GetReducedModelForC: features used when sampling the class c_i
m_pFeat->FindPosDep(afeat, seq, nPos, 1);
for (int i = 0; i < afeat.GetNum(); i++) {
    logSum += m_value[afeat[i]];
}

// GetReducedModelForW: features used when sampling the word w_i
m_pFeat->FindPosDep(afeat, seq, nPos, 2);
for (int i = 0; i < afeat.GetNum(); i++) {
    logSum += m_value[afeat[i]];
}

// GetReducedModel: all the features depending on position nPos
m_pFeat->FindPosDep(afeat, seq, nPos, 0);
for (int i = 0; i < afeat.GetNum(); i++) {
    logSum += m_value[afeat[i]];
}
// Model::GetMarginalProbOfC (fragment)
for (int i = 0; i < pXs->GetNum(); i++) {
    // sum over every word x_i belonging to the class
    resLogp = Log_Sum(resLogp, GetReducedModel(seq, nPos));
}
// Model::SampleC (fragment)
if (m_pVocab->GetClassNum() == 0) {
    // no class layer: nothing to sample
}

Vec<LogP> vlogps_c(m_pVocab->GetClassNum());
ProposeCProbs(vlogps_c, seq, nPos);

LogP logpRi = vlogps_c[ci];
LogP logpR0 = vlogps_c[C0];

LogP Logp_ci = GetMarginalProbOfC(seq, nPos); // with the current class ci
LogP Logp_C0 = GetMarginalProbOfC(seq, nPos); // with the proposed class C0

LogP acclogp = logpRi + Logp_C0 - (logpR0 + Logp_ci);

m_nSampleHTotalTimes++;
m_nSampleHAccTimes++;   // incremented only when the proposal is accepted
// Model::SampleX (fragment)
if (nPos >= seq.GetLen()) {
    lout_error("[Model] SampleH: the nPos(" << nPos
        << ") > the length of sequence(" << seq.GetLen() << ")");
}

for (int i = 0; i < pXs->GetNum(); i++) {
    aLogps[i] = GetReducedModelForW(seq, nPos);
}

idx = pXs->Find(nSaveX);
// if the current word cannot be found in its class:
lout_error("Can't find the VocabID(" << nSaveX << ") in the array.\n"
    << "This may be because word(" << nSaveX << ") does not belong to class(" /* truncated in the listing */);

// Model::AISNormalize(int nLen, int nChain, int nInter) (fragment)
int nParamsNum = GetParamNum();
this->GetParam(vParamsP0.GetBuf());

// log normalization constant of the uniform proposal distribution p_n
LogP logz_pn = nLen * log((double)m_pVocab->GetSize());

Model *pInterModel = new Model(m_pVocab, m_maxlen);
pInterModel->m_pFeat = m_pFeat;
pInterModel->m_value.Reset(GetParamNum());

aLogWeight.SetNum(nChain);

int localChainNum = 0;
for (int k = 0; k < nChain; k++) {
    pInterModel->SetParam(vParamsPn.GetBuf());
    LogP logp_old = pInterModel->GetLogProb(seq, false) - logz_pn;

    for (int t = nInter - 1; t >= 0; t--) {
        // interpolate the parameters between the target p_0 and the proposal p_n
        for (int i = 0; i < nParamsNum; i++)
            pParamsCur[i] = pP0[i] * (1 - beta) + pPn[i] * beta;
        pInterModel->SetParam(pParamsCur);

        LogP rate = pInterModel->GetLogProb(seq, false) - logp_old;

        pInterModel->MarkovMove(seq);
        logp_old = pInterModel->GetLogProb(seq, false);
    }
    aLogWeight[k] = log_w;

    // report the running estimate of logz
    localLogSum = Log_Sum(localLogSum, log_w);
    localChainNum = localChainNum + 1;
    LogP localLogz = localLogSum - log(localChainNum);
    lout << localLogz << "(" << localChainNum << ") ";
}

pInterModel->m_pFeat = NULL; // the feature table is shared; detach before deleting
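Each chain thus accumulates an annealed importance weight log w = sum_t [log p_{beta_t}(x_t) - log p_{beta_{t+1}}(x_t)] along the interpolation path, and the running estimate printed above is the log-domain average, localLogz = Log_Sum(log w over chains) - log(chain count). The schedule beta = GetAISFactor(t, nInter) presumably decreases towards 0 with t, moving the interpolated model from the proposal p_n to the target p_0.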
// Model::AISNormalize over a length range (fragment)
lout << "AIS norm len from " << nLenMin << " to " << nLenMax << endl;
int nParamsNum = GetParamNum();

this->GetParam(vParamsP0.GetBuf());

lout << "AISNorm: Using all the unigrams." << endl;

// build the unigram proposal: keep only the unigram weights of the target model
Vec<LogP> aWordLogp(m_pVocab->GetSize());
for (int w = 0; w < m_pVocab->GetSize(); w++) {
    m_pFeat->Find(afind, seq, 1, 1);
    for (int i = 0; i < afind.GetNum(); i++) {
        vParamsPn[afind[i]] = vParamsP0[afind[i]];
        dvalue += vParamsP0[afind[i]];
    }
    aWordLogp[w] = dvalue;
}

LogP logsum = Log_Sum(aWordLogp.GetBuf(), aWordLogp.GetSize());
for (int i = 0; i <= m_maxlen; i++) {
    alogz_pn[i] = i * logsum;
}

Model *pInterModel = new Model(m_pVocab, m_maxlen);
pInterModel->m_pFeat = m_pFeat;

Mat<LogP> matLogWeight(m_maxlen + 1, nChain);
Mat<LogP> matLogPOld(m_maxlen + 1, nChain);

matLogWeight.Fill(0);

// initialize nChain sequences for every length
for (int i = 1; i <= m_maxlen; i++) {
    for (int j = 0; j < nChain; j++) {
        matSeq[i][j] = new Seq(i);
        Seq *pSeq = matSeq[i][j];
        for (int nPos = 0; nPos < pSeq->GetLen(); nPos++) {
            // draw each position from the proposal
        }
        matLogPOld[i][j] = pInterModel->GetLogProb(*matSeq[i][j], false) - alogz_pn[i];
    }
}

for (int t = nInter - 1; t >= 0; t--) {
    for (int i = 0; i < nParamsNum; i++)
        pParamsCur[i] = pP0[i] * (1 - beta) + pPn[i] * beta;

#pragma omp parallel for
    for (int nLen = nLenMin; nLen <= nLenMax; nLen++) {
        for (int k = 0; k < nChain; k++) {
            LogP rate = pInterModel->GetLogProb(*matSeq[nLen][k], false) - matLogPOld[nLen][k];
            matLogWeight[nLen][k] += rate;

            matLogPOld[nLen][k] = pInterModel->GetLogProb(*matSeq[nLen][k], false);
        }
    }
}

// release the chains
for (int i = 1; i <= m_maxlen; i++) {
    for (int j = 0; j < nChain; j++) {
        // delete matSeq[i][j]
    }
}

for (int nLen = nLenMin; nLen <= nLenMax; nLen++) {
    LogP logz = Log_Sum(matLogWeight[nLen].GetBuf(), matLogWeight[nLen].GetSize())
        - Prob2LogP(nChain);
    lout << "logz[" << nLen << "] = " << logz << " logw= ";
    for (int i = 0; i < matLogWeight[nLen].GetSize(); i++) {
        lout << matLogWeight[nLen][i] << " ";
    }
}
void Sample(Seq &seq)
[sample] Perform one trans-dimensional mixture-sampling step.
T & Get(unsigned int i, unsigned int j)
#define SAFE_DELETE(p)
memory release
virtual void MarkovMove(Seq &seq)
[sample] Markov Move - perform the Gibbs sampling.
T & Get(int i)
get the value at position i
LogP ProposeLength(int nOld, int &nNew, bool bSample)
[sample] Propose the length, using the variable m_matLenJump
void ReadT(const char *pfilename)
Read Model.
int LogLineSampling(const LogP *pdProbs, int nNum)
virtual void ExactNormalize()
[exact] Exact Normalization
void SetClass(Vocab *pv)
set the class based on the word sequence
void Random(Vocab *pv)
Random.
void Reset(Vocab *pv, int maxlen)
reset; maxlen is the maximum length, excluding the beg/end symbols.
void GetNodeExp(int nLen, double *pExp)
[exact] E_{p_l}[f]: Exactly calculate the expectation over x and h for length nLen ...
LogP SampleX(Seq &seq, int nPos, bool bSample=true)
[sample] Sample the x_i at position nPos
LogP ProposeC0(VocabID &ci, Seq &seq, int nPos, bool bSample)
[sample] Propose the c_{i} at position i. Then return the proposal probability R(c_i|h_i,c_{other})
void LineNormalize(Prob *pdProbs, int nNum)
void SetPi(Prob *pPi)
Set the pi.
LogP Log_Sum(LogP x, LogP y)
log-domain addition, i.e. log(e^x + e^y); see the sketch at the end of this list.
LogP GetLogProb(Seq &seq, bool bNorm=true)
calculate the probability
LogP GetReducedModel(Seq &seq, int nPos)
[sample] An unnormalized reduced model depending on nPos.
void LoadFromCorpus(const char *pcorpus, const char *pfeatstyle, int nOrder)
load ngram features from corpus
virtual void SetParam(PValue *pValue)
Set the parameters.
LogP GetReducedModelForC(Seq &seq, int nPos)
[sample] An unnormalized reduced model used to sample the class c_i.
virtual int Scanf(const char *p_pMessage,...)
scanf
void Reset(int p_len)
reset only changes the len variable; it does not change the buffer size.
T * GetBuffer(int i=0) const
get the buffer pointer
virtual void Print(const char *p_pMessage,...)
print
virtual LogP ClusterSum(int *pSeq, int nLen, int nPos, int nOrder)
This function needs to be overridden in derived classes. Calculate the log probability of each cluster.
class Seq
define a sequence, including the word sequence and the class sequence
void FeatCount(Seq &seq, double *pCount, double dadd=1.0)
Count the feature number in a sequence.
int m_nTotalNum
total feature number
int LineSampling(const Prob *pdProbs, int nNum)
int Find(T t)
Find a value and return the position.
void GetParam(PValue *pValue)
Get the parameter vector.
void Set(Array< int > &aInt, Vocab *pv)
transform the word sequence (from file) to Seq
LogP GetMarginalProbOfC(Seq &seq, int nPos)
[sample] given c_i, sum the probabilities of x_i, i.e. P(c_i)
void SetNum(int n)
Set the array size, allocating enough memory.
void AddAllLine(T beg, T end, T step=1)
void Progress(long long n=-1, bool bInit=false, long long total=100, const char *head="")
progress bar
bool Acceptable(Prob prob)
Feat * m_pFeat
hash all the features
void SampleC(Seq &seq, int nPos)
[sample] Sample the c_i at position nPos without x_i.
void ProposeCProbs(VecShell< LogP > &logps, Seq &seq, int nPos)
[sample] Return the proposal distribution of c_i at position nPos
void Clean()
Clean the array: just set the top of the array to -1 and do not release the memory.
LogP ClusterSum(Seq &seq, int nPos, int nOrder)
[exact] Calculate the log probability of each cluster.
void Reset(int nOrder, bool bClass)
Reset, set the order. Note: the maximum order (including the skip) may be larger than nOrder...
int GetNum() const
Get Array number.
VocabID * GetClassSeq()
get class sequence
LogP LogLineNormalize(LogP *pdProbs, int nNum)
double GetAISFactor(int t, int T)
Get the AIS intermediate factor beta_t.
VocabID * GetWordSeq()
get word sequence
Log lout
the definition is in wb-log.cpp
class Feat
include all the feature tables
void WriteT(const char *pfilename)
Write Model.
Vec< PValue > m_value
the value of each feature
LogP GetReducedModelForW(Seq &seq, int nPos)
[sample] An unnormalized reduced model used to sample the word w_i.
LogP AISNormalize(int nLen, int nChain, int nInter)
perform AIS to calculate the normalization constants, return the logz of given length ...
void LocalJump(Seq &seq)
[sample] Local Jump - sample a new length
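As referenced above, a minimal sketch of the log-domain addition behind Log_Sum, using the standard log-sum-exp identity (the name Log_Sum_Sketch is illustrative, not the library's own):

    #include <cmath>
    #include <algorithm>

    typedef double LogP;

    // log(e^x + e^y), computed stably by factoring out the larger exponent.
    LogP Log_Sum_Sketch(LogP x, LogP y)
    {
        LogP hi = std::max(x, y);
        LogP lo = std::min(x, y);
        return hi + std::log(1.0 + std::exp(lo - hi));
    }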