TRF Language Model
trf-ml-train.cpp
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding language may use different comment styles.
16 
17 
18 #include "trf-ml-train.h"
19 #include <omp.h>
20 
21 namespace trf
22 {
23  MLfunc::MLfunc(Model *pModel, CorpusBase *pTrain, CorpusBase *pValid /* = NULL */, CorpusBase *pTest /* = NULL */)
24  {
25  m_pathOutputModel = NULL;
26 
27  Reset(pModel, pTrain, pValid, pTest);
28  }
// Re-bind this objective to a model and to the train/valid/test corpora, then
// rebuild the per-length counts of the training corpus in m_trainPi.
//   pModel : model to train; stored in m_pModel
//   pTrain : training corpus; stored in m_pCorpusTrain (dereferenced unconditionally)
//   pValid : validation corpus, may be NULL; stored in m_pCorpusValid
//   pTest  : test corpus, may be NULL; stored in m_pCorpusTest
// NOTE(review): the embedded listing numbers jump (35->39, 58->60, 60->62,
// 67->70, 71->73, 75->78), so some original source lines are not visible in
// this listing -- confirm against the full trf-ml-train.cpp.
29  void MLfunc::Reset(Model *pModel, CorpusBase *pTrain, CorpusBase *pValid /* = NULL */, CorpusBase *pTest /* = NULL */)
30  {
31  m_pModel = pModel;
32  m_pCorpusTrain = pTrain;
33  m_pCorpusValid = pValid;
34  m_pCorpusTest = pTest;
35 
39 
41 
42 
// Longest sequence length over every corpus that was supplied.
44  int nMaxLen = m_pCorpusTrain->GetMaxLen();
45  if (pValid)
46  nMaxLen = max(nMaxLen, pValid->GetMaxLen());
47  if (pTest)
48  nMaxLen = max(nMaxLen, pTest->GetMaxLen());
49 
// A model reporting a non-positive max length is re-created with the corpus
// max length; otherwise a mismatch is only reported, not corrected.
50  if (m_pModel->GetMaxLen() <= 0) {
51  lout_warning("[MLfunc] Reset: Re-set the model with length=" << nMaxLen);
52  m_pModel->Reset(m_pModel->GetVocab(), nMaxLen);
53  }
54  else if (nMaxLen != m_pModel->m_maxlen) {
55  lout_warning("[MLfunc] Reset: the max-len in training (" << nMaxLen
56  << ") is not equal to m_pModel->m_maxlen (" << m_pModel->m_maxlen<<")");
57  }
58 
// Accumulate the training length counts into m_trainPi.
// NOTE(review): m_trainPi is filled with 0 here but its sizing is not visible
// (listing line 61 is missing) -- presumably it holds GetMaxLen()+1 entries; confirm.
60  Array<int> aLenCount;
62  m_trainPi.Fill(0);
63  m_pCorpusTrain->GetLenCount(aLenCount);
64  for (int i = 1; i < aLenCount.GetNum(); i++) {
// Lengths above the model's max length are added to the max-length entry.
65  int nLen = min(m_pModel->GetMaxLen(), i);
66  m_trainPi[nLen] += aLenCount[i];
67  }
// NOTE(review): only raw counts are accumulated in the visible code; any
// normalisation would be on the missing listing lines 68-69 -- confirm.
70 
71  lout_variable(nMaxLen);
// Log the entries of m_trainPi from index 1 onwards (index 0 is skipped).
73  lout << "train-pi = [ "; lout.output(m_trainPi.GetBuf() + 1, m_trainPi.GetSize() - 1); lout << "]"<< endl;
74 
75 
78  }
// Copy m_nParamNum parameters from pdParams into m_value, converting each to PValue.
// Does nothing when pdParams is NULL.
// NOTE(review): listing lines 84 and 87-89 are missing from this view; presumably
// they size m_value and forward it to the model -- confirm against the full file.
79  void MLfunc::SetParam(double *pdParams)
80  {
81  if (pdParams == NULL)
82  return;
83 
85  for (int i = 0; i < m_nParamNum; i++)
86  m_value[i] = (PValue)pdParams[i];
88 
90  }
// Copy m_nParamNum values from m_value into the caller's buffer pdParams.
// Does nothing when pdParams is NULL; otherwise pdParams must have room for
// m_nParamNum doubles.
// NOTE(review): listing lines 96-97 are missing from this view; presumably they
// refresh m_value from the model before the copy -- confirm against the full file.
91  void MLfunc::GetParam(double *pdParams)
92  {
93  if (pdParams == NULL)
94  return;
95 
98 
99  for (int i = 0; i < m_nParamNum; i++)
100  pdParams[i] = m_value[i];
101  }
// Return the mean log-probability, under m_pModel, of sequences in pCorpus.
//   pCorpus : corpus to evaluate
//   nCalNum : number of leading sequences to use; -1 means the whole corpus,
//             otherwise the count is capped at pCorpus->GetNum()
//   pLL     : optional output; when non-NULL it is resized to the number of
//             evaluated sequences and receives each sequence's log-probability
// NOTE(review): listing line 106 is missing from this view; it presumably
// declares the aSeq buffer named in the firstprivate clause -- confirm.
// NOTE(review): when no sequence is evaluated, nNum is 0 and the final division
// is 0.0/0 -- confirm callers never pass an empty corpus.
102  double MLfunc::GetLL(CorpusBase *pCorpus, int nCalNum /* = -1 */, Vec<double> *pLL /* = NULL */)
103  {
104  int nThread = omp_get_max_threads();
105 
// One accumulator slot per thread, so the parallel loop needs no locking.
107  Vec<double> vSum(nThread);
108  Vec<int> vNum(nThread);
109  vSum.Fill(0);
110  vNum.Fill(0);
111 
112  int nCorpusNum = (nCalNum == -1) ? pCorpus->GetNum() : min(nCalNum, pCorpus->GetNum());
113 
114  if (pLL)
115  pLL->Reset(nCorpusNum);
116 
117  //lout.Progress(0, true, nCorpusNum-1, "[MLfunc] LL:");
// firstprivate gives each thread its own copy of aSeq.
118 #pragma omp parallel for firstprivate(aSeq)
119  for (int i = 0; i < nCorpusNum; i++) {
120  pCorpus->GetSeq(i, aSeq);
121 
122  Seq seq;
123  seq.Set(aSeq, m_pModel->m_pVocab);
124  LogP logprob = m_pModel->GetLogProb(seq);
125 
// Each thread writes only its own slot; (*pLL)[i] is written by the single
// iteration that owns index i.
126  vSum[omp_get_thread_num()] += logprob;
127  vNum[omp_get_thread_num()]++;
128  if (pLL)
129  (*pLL)[i] = logprob;
130 
131 // #pragma omp critical
132 // {
133 // lout.Progress();
134 // }
135 
136  }
137 
// Combine the per-thread partial sums and counts.
138  double dsum = 0;
139  int nNum = 0;
140  for (int t = 0; t < nThread; t++) {
141  dsum += vSum[t];
142  nNum += vNum[t];
143  }
144  return dsum / nNum;
145  }
// NOTE(review): the signature line (listing line 146) is missing from this view.
// The body uses pCorpus and vExp, so this is presumably
// MLfunc::GetEmpExp(CorpusBase *pCorpus, Vec<double> &vExp) -- confirm.
// Accumulates m_pModel->FeatCount() over every sequence of pCorpus into one row
// per thread, sums the rows into vExp, and divides vExp by the number of sequences.
// NOTE(review): listing line 148 is also missing; it presumably declares the aSeq
// buffer named in the firstprivate clause -- confirm.
147  {
// One row of m_nParamNum accumulators per thread.
149  Mat<double> matExp(omp_get_max_threads(), m_nParamNum);
150  matExp.Fill(0);
151 
152  lout.Progress(0, true, pCorpus->GetNum()-1, "[MLfunc] E[f] :");
153 #pragma omp parallel for firstprivate(aSeq)
154  for (int i = 0; i < pCorpus->GetNum(); i++) {
155  pCorpus->GetSeq(i, aSeq);
156 
157  Seq seq;
158  seq.Set(aSeq, m_pModel->m_pVocab);
// Each thread adds feature counts only into its own row.
159  m_pModel->FeatCount(seq, matExp[omp_get_thread_num()].GetBuf());
160 
// The progress update on the shared logger is serialised across threads.
161 #pragma omp critical
162  {
163  lout.Progress();
164  }
165 
166  }
167 
// Sum the per-thread rows, then average over the corpus size.
168  vExp.Reset(m_nParamNum);
169  vExp.Fill(0);
170  for (int t = 0; t < omp_get_max_threads(); t++) {
171  vExp += matExp[t];
172  }
173  vExp /= pCorpus->GetNum();
174  }
// NOTE(review): the signature line (listing line 175) is missing from this view;
// this is presumably MLfunc::GetValue(), the objective value f(x) -- confirm.
// Returns the negated GetLL() result for the training corpus.
176  {
177  //SetParam(pdParams);
178 
179  return -GetLL(m_pCorpusTrain);
180 
// Unreachable: the function has already returned on the statement above.
181  return 0;
182  }
183  void MLfunc::GetGradient(double *pdGradient)
184  {
185  //SetParam(pdParams);
186  Vec<double> aExpTheoretical(m_nParamNum);
187 
188 
189  m_pModel->GetNodeExp(aExpTheoretical.GetBuf(), m_trainPi.GetBuf());
190 
191  for (int i = 0; i < m_nParamNum; i++) {
192  pdGradient[i] = -(m_vEmpiricalExp[i] - aExpTheoretical[i]);
193  }
194 
195 
196  static File fileDbg("GradientML.dbg", "wt");
197  fileDbg.PrintArray("%f ", m_vEmpiricalExp.GetBuf(), m_nParamNum);
198  fileDbg.PrintArray("%f ", aExpTheoretical.GetBuf(), m_nParamNum);
199  }
// Fill pdValues with the negated GetLL() result for each available corpus and
// return how many values were written.
//   t        : iteration index; only used by the (t - 1) % 10 check below
//   pdValues : output buffer; receives the training value first, then the
//              validation and test values when those corpora are non-NULL,
//              so it needs room for up to three doubles
200  int MLfunc::GetExtraValues(int t/*, double *pdParams*/, double *pdValues)
201  {
202  //SetParam(pdParams);
203 
// NOTE(review): the body of this branch (listing line 205) is missing from this
// view; presumably it writes the model to m_pathOutputModel -- confirm.
204  if ( (t - 1) % 10 == 0) {
206  }
207 
208  int nValue = 0;
209  pdValues[nValue++] = -GetLL(m_pCorpusTrain);
210  if (m_pCorpusValid) pdValues[nValue++] = -GetLL(m_pCorpusValid);
211  if (m_pCorpusTest) pdValues[nValue++] = -GetLL(m_pCorpusTest);
212 
213  return nValue;
214 
215  }
216 }
CorpusBase * m_pCorpusValid
valid corpus
Definition: trf-ml-train.h:37
int GetParamNum() const
Get parameter number.
Definition: trf-model.h:106
const char * m_pathOutputModel
Write to model during iteration.
Definition: trf-ml-train.h:44
void Reset(Vocab *pv, int maxlen)
reset, the maxlen is the length excluding the beg/end symbols.
Definition: trf-model.cpp:28
void GetNodeExp(int nLen, double *pExp)
[exact] E_{p_l}[f]: Exactly calculate the expectation over x and h for length nLen ...
Definition: trf-model.cpp:245
void Fill(T v)
Definition: wb-mat.h:397
void SetPi(Prob *pPi)
Set the pi.
Definition: trf-model.cpp:70
LogP GetLogProb(Seq &seq, bool bNorm=true)
calculate the probability
Definition: trf-model.cpp:74
double PValue
Definition: trf-def.h:26
virtual void SetParam(PValue *pValue)
Set the parameters.
Definition: trf-model.cpp:58
Log & output(T *pArray, int n, const char *pgap=" ")
output an array
Definition: wb-log.h:170
double LogP
Definition: trf-def.h:27
virtual int GetExtraValues(int t, double *pdValues)
calculate extra values which will be printed at each iteration
#define lout_variable(x)
Definition: wb-log.h:179
Vec< PValue > m_value
save the temp value of type PValue.
Definition: trf-ml-train.h:34
CorpusBase * m_pCorpusTrain
training corpus
Definition: trf-ml-train.h:36
define a sequence including the word sequence and class sequence
Definition: trf-feature.h:41
virtual bool GetSeq(int nLine, Array< VocabID > &aSeq)=0
get the sequence in nLine
void FeatCount(Seq &seq, double *pCount, double dadd=1.0)
Count the feature number in a sequence.
Definition: trf-model.cpp:106
virtual double GetLL(CorpusBase *pCorpus, int nCalNum=-1, Vec< double > *pLL=NULL)
calculate the log-likelihood on corpus
void Reset(Model *pModel, CorpusBase *pTrain, CorpusBase *pValid=NULL, CorpusBase *pTest=NULL)
int m_maxlen
the maximum length of model, excluding <s> and </s>. The min-len = 1
Definition: trf-model.h:57
Vec< Prob > m_trainPi
the length distribution in training corpus
Definition: trf-ml-train.h:40
TRF model.
Definition: trf-model.h:51
virtual void GetGradient(double *pdGradient)
calculate the gradient g(x)
Vec< double > m_vEmpiricalExp
the empirical expectation
Definition: trf-ml-train.h:42
void Fill(T v)
Definition: wb-mat.h:279
file class.
Definition: wb-file.h:94
int GetSize() const
Definition: wb-mat.h:69
void GetParam(PValue *pValue)
Get the parameter vector.
Definition: trf-model.cpp:64
void Set(Array< int > &aInt, Vocab *pv)
transform the word sequence (from file) to Seq
Definition: trf-feature.cpp:22
T * GetBuf() const
Definition: wb-mat.h:68
void Progress(long long n=-1, bool bInit=false, long long total=100, const char *head="")
progress bar
Definition: wb-log.cpp:146
Array< VocabID > aSeq
Definition: main-TRF.cpp:153
Vocab * m_pVocab
Definition: trf-model.h:62
#define lout_warning(x)
Definition: wb-log.h:184
Vocab * GetVocab() const
Get Vocab.
Definition: trf-model.h:102
void GetParam(double *pdParams)
int GetNum() const
Get Array number.
Definition: wb-vector.h:240
void Reset(int size=0)
Definition: wb-mat.h:360
Log lout
the definition is in wb-log.cpp
Definition: wb-log.cpp:22
virtual void SetParam(double *pdParams)
set the parameter.
void GetEmpExp(CorpusBase *pCorpus, Vec< double > &vExp)
get the empirical expectation
virtual void GetLenCount(Array< int > &aLenCount)=0
get the length count
CorpusBase * m_pCorpusTest
test corpus
Definition: trf-ml-train.h:38
void PrintArray(const char *pformat, TYPE *pbuf, int num)
print an array into a file
Definition: wb-file.h:148
virtual double ExactNormalize(int nLen)
[exact] Exact Normalization, return the logz of given length
Definition: trf-model.cpp:213
int m_nParamNum
the parameter number
Definition: wb-solve.h:45
void WriteT(const char *pfilename)
Write Model.
Definition: trf-model.cpp:158
Definition: trf-alg.cpp:20
Model * m_pModel
HRF model.
Definition: trf-ml-train.h:33
int GetMaxLen() const
Get max-len.
Definition: trf-model.h:100
virtual int GetMaxLen() const
get the max length
Definition: trf-corpus.h:51
virtual int GetNum() const
get the seq number
Definition: trf-corpus.h:47
virtual double GetValue()
calculate the function value f(x)