50 " trf -vocab [vocab] -read [model] -write [output model] -norm-method [Exact/AIS]\n" 51 "Calculate log-likelihood:\n" 52 " trf -vocab [vocab] -read [model] -test [txt-id-file]\n" 53 "language model rescoring:\n" 54 " trf -vocab [vocab] -read [model] -nbest [nbest list] -lmscore [output lmscore]\n" 55 "Revise the length distribution pi:\n" 56 " trf -vocab [vocab] -read [model] -write [output moddel] -len-file [a txt-id-file used to summary pi]\n" 59 #define lout_exe lout<<"[TRF] " 90 opt.
Parse(_argc, _argv);
92 lout <<
"*********************************************" << endl;
93 lout <<
" TRF.exe " << endl;
94 lout <<
"\t" << __DATE__ <<
"\t" << __TIME__ <<
"\t" << endl;
95 lout <<
"**********************************************" << endl;
98 lout <<
"[OMP] omp_thread = " << omp_get_max_threads() << endl;
123 lout_exe <<
"-LL = " << -dLL << endl;
124 lout_exe <<
"PPL = " << dPPL << endl;
155 #pragma omp parallel for firstprivate(aSeq) 156 for (
int i = 0; i < nCorpusNum; i++) {
163 aLL[omp_get_thread_num()] += logprob;
165 aSents[omp_get_thread_num()] += 1;
176 if (pPPL) *pPPL = exp(-dLL * nSent / (nSent + nWord));
182 for (
int i = 0; i < aStrs.
GetNum(); i++) {
188 pvid = vocabhash.
Find(
"<UNK>");
204 for (
int i = 0; i < pV->
GetSize(); i++) {
207 lout_exe <<
"Find words with same name but different id! (str=" 208 << pV->
GetWordStr(i) <<
" id=" << i <<
")" << endl;
215 lout_exe <<
"Rescoring: " << pathTest <<
" ..." << endl;
219 File file(pathTest,
"rt");
221 while (pLine = file.
GetLine(
true)) {
222 String curLabel = strtok(pLine,
" \t\n");
223 String curSent = strtok(NULL,
"\n");
226 curSent.
Split(aWordStrs,
" \t\n");
232 seq.
Set(aWordIDs, pV);
238 if (fTestid.
Good()) {
249 if (strType ==
"exact") {
250 lout_exe <<
"Exact Normalization..." << endl;
253 else if (strType ==
"ais") {
256 lout_exe <<
"[Input] AIS chain number = ";
260 lout_exe <<
"[Input] AIS intermediate distribution number = ";
263 lout_exe <<
"AIS normalization..." << endl;
287 lout <<
"Revise the length distribution pi..." << endl;
292 File file(pathLenFile,
"rt");
295 while (pLine = file.
GetLine()) {
298 char *p = strtok(pLine,
" \t\n");
301 p = strtok(NULL,
" \t\n");
303 nLen = min(nLen, nMaxLen);
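The perplexity computed at line 176 of the listing follows from the accumulated log-likelihood. Assuming dLL holds the average log-likelihood per sentence (so dLL * nSent is the corpus total), with nSent the number of sentences and nWord the number of words, the expression is the usual per-token perplexity in which one end-of-sentence token is counted for each sentence:

\mathrm{PPL} \;=\; \exp\!\left(-\frac{\mathrm{dLL}\cdot n_{\mathrm{sent}}}{n_{\mathrm{word}}+n_{\mathrm{sent}}}\right) \;=\; \exp\!\left(-\frac{\sum_{i}\log P(x^{(i)})}{n_{\mathrm{word}}+n_{\mathrm{sent}}}\right)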
#define SAFE_DELETE(p)
memory release
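The expansion of SAFE_DELETE is not shown on this page. A common definition consistent with the "memory release" brief (an assumption, not necessarily this project's exact macro) is:

    // delete the object behind p if the pointer is non-NULL, then clear the pointer
    #define SAFE_DELETE(p) { if (p) { delete (p); (p) = NULL; } }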
void WordStr2ID(Array< VocabID > &aIDs, Array< String > &aStrs, LHash< const char *, VocabID > &vocabhash)
void ReadT(const char *pfilename)
Read Model.
void ModelRevisePi(Model &m, const char *pathLenFile)
void LMRescore(Model &m, const char *pathTest)
void SetPi(Prob *pPi)
Set the pi.
void Parse(const char *plabel, const char *pvalue)
parse a single option; "pvalue" can be NULL
void Split(Array< String > &aStrs, const char *delimiter)
split into a string array, using strtok().
LogP GetLogProb(Seq &seq, bool bNorm=true)
calculate the probability
double CalculateLL(Model &m, CorpusTxt *pCorpus, int nCorpusNum, double *pPPL=NULL)
lout<< "*********************************************"<< endl;lout<< " TRF.exe "<< endl;lout<< "\"<< __DATE__<< "\"<< __TIME__<< "\"<< endl;lout<< "**********************************************"<< endl;omp_set_num_threads(cfg_nThread);lout<< "[OMP] omp_thread = "<< omp_get_max_threads()<< endl;omp_rand(cfg_nThread);Vocab v(cfg_pathVocab);Model m(&v);lout_exe<< "Read model: "<< cfg_pathModelRead<< endl;m.ReadT(cfg_pathModelRead);if(cfg_norm_method) { ModelNorm(m, cfg_norm_method);} if(cfg_pathLenFile) { ModelRevisePi(m, cfg_pathLenFile);} if(cfg_pathTest) { CorpusTxt *p=new CorpusTxt(cfg_pathTest);double dPPL;double dLL=CalculateLL(m, p, p->GetNum(), &dPPL);lout_exe<< "calculate LL of : "<< cfg_pathTest<< endl;lout_exe<< "-LL = "<< -dLL<< endl;lout_exe<< "PPL = "<< dPPL<< endl;SAFE_DELETE(p);} if(cfg_pathNbest) { LMRescore(m, cfg_pathNbest);} if(cfg_pathModelWrite) { lout_exe<< "Write model: "<< cfg_pathModelWrite<< endl;m.WriteT(cfg_pathModelWrite);} return 1;}double CalculateLL(Model &m, CorpusTxt *pCorpus, int nCorpusNum, double *pPPL){ Array< double > aLL(omp_get_max_threads())
Array< int > aWords(omp_get_max_threads())
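    // The listing breaks off here. What follows is a minimal sketch of how the
    // rest of CalculateLL might combine the per-thread accumulators seen at
    // lines 155-176 of the listing above; the variable names (aSeq, pVocab)
    // and the exact reduction are assumptions, not the toolkit's verbatim code.
    Array<int> aSents(omp_get_max_threads());
    aLL.Fill(0); aWords.Fill(0); aSents.Fill(0);

    Array<VocabID> aSeq;
#pragma omp parallel for firstprivate(aSeq)
    for (int i = 0; i < nCorpusNum; i++) {
        pCorpus->GetSeq(i, aSeq);              // word-id sequence of line i
        Seq seq;
        seq.Set(aSeq, pVocab);                 // pVocab: the Vocab the model was built on (assumed in scope)
        LogP logprob = m.GetLogProb(seq);      // normalized log-probability of the sequence
        aLL[omp_get_thread_num()]    += logprob;
        aWords[omp_get_thread_num()] += aSeq.GetNum();
        aSents[omp_get_thread_num()] += 1;
    }

    int    nSent = aSents.Sum();
    int    nWord = aWords.Sum();
    double dLL   = aLL.Sum() / nSent;          // average log-likelihood per sentence
    if (pPPL) *pPPL = exp(-dLL * nSent / (nSent + nWord));
    return dLL;
}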
void ModelNorm(Model &m, const char *type)
T * GetBuffer(int i=0) const
get the buffer pointer
virtual void Print(const char *p_pMessage,...)
print
T Sum()
sum all the values in the array
define a sequence including the word sequence and class sequence
char * cfg_pathModelWrite
string m_strOtherHelp
extra help information, which will be output in PrintUsage
void Set(Array< int > &aInt, Vocab *pv)
transform the word sequence (from file) to Seq
virtual char * GetLine(bool bPrecent=false)
Read a line into the buffer.
void Progress(long long n=-1, bool bInit=false, long long total=100, const char *head="")
progress bar
int omp_rand(int thread_num)
Array< int > aSents(omp_get_max_threads())
DataT * Insert(KeyT key, bool &bFound)
Insert a value.
char * cfg_writeLmscoreDebug
int GetNum() const
Get Array number.
void Add(ValueType t, const char *pLabel, void *pAddress, const char *pDocMsg=NULL)
Add an option.
int GetSize()
get the vocab size, i.e. the word number
Log lout
the definition is in wb-log.cpp
virtual bool GetSeq(int nLine, Array< VocabID > &aSeq)
get the sequence in nLine
void Fill(T m)
set all the values to m
bool Good() const
return whether the file is accessible.
const char * GetWordStr(int id)
get word string
void PrintArray(const char *pformat, TYPE *pbuf, int num)
print an array into a file
virtual double ExactNormalize(int nLen)
[exact] Exact Normalization, return the logz of given length
char * GetBuffer() const
get buffer
void WriteT(const char *pfilename)
Write Model.
int GetMaxLen() const
Get max-len.
Get the option from command line or command files.
virtual int GetNum() const
get the seq number
LogP AISNormalize(int nLen, int nChain, int nInter)
perform AIS to calculate the normalization constants, return the logz of given length ...
DataT * Find(KeyT key, bool &bFound)
Find a value.
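Read together with the fragments at lines 215-238 of the listing, the entries above (File::GetLine, String::Split, WordStr2ID, Seq::Set, GetLogProb) outline the n-best rescoring loop in LMRescore. A minimal sketch of that shape follows; vocabhash, pV, and the score-output step are assumptions, not the toolkit's verbatim code:

    void LMRescore(Model &m, const char *pathTest)
    {
        lout_exe << "Rescoring: " << pathTest << " ..." << endl;

        File file(pathTest, "rt");
        char *pLine;
        while (pLine = file.GetLine(true)) {            // true: print a progress percentage
            String curLabel = strtok(pLine, " \t\n");   // first token: the utterance label
            String curSent  = strtok(NULL, "\n");       // remainder of the line: the word sequence

            Array<String>  aWordStrs;
            Array<VocabID> aWordIDs;
            curSent.Split(aWordStrs, " \t\n");           // split into word strings
            WordStr2ID(aWordIDs, aWordStrs, vocabhash);  // map to ids; unseen words fall back to <UNK>

            Seq seq;
            seq.Set(aWordIDs, pV);                       // pV: the model's Vocab (assumed in scope)
            LogP logprob = m.GetLogProb(seq);            // TRF log-probability used as the LM score
            // write curLabel and logprob to the -lmscore output file (omitted here)
        }
    }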