29 for (
int i = 0; i <
nLen; i++) {
42 for (
int i = 0; i <
nLen; i++) {
47 for (
int i = 0; i <
nLen; i++) {
54 for (
int i = 0; i <
x.
GetRow(); i++) {
61 for (
int i = 0; i <
x.
GetRow(); i++) {
73 int nLen = strlen(pstyle);
74 for (
int i = 0; i <
nLen;) {
78 fd.
n = atoi(pstyle + i);
79 while (pstyle[i] !=
']' && i < nLen) {
85 lout_error(
"[FeatStyle] Set: illegal style : " << pstyle);
95 if (m_nOrder > nOrder) {
100 for (
int nfield = 0; nfield < m_aFields.GetNum(); nfield++) {
101 Field info = m_aFields[nfield];
102 for (
int i = 0; i < info.
n; i++) {
104 key.
Add(seq.
x[info.
i][nCur]);
117 m_aKey.SetNum(omp_get_max_threads());
118 for (
int i = 0; i < m_aKey.GetNum(); i++) {
137 m_aStyleInfo.
Clean();
139 int nLen = strlen(pstyle);
140 const char *pColon = strchr(pstyle,
':');
142 m_aStyleInfo.Add() =
new FeatStyle(pstyle);
145 int nBeg = pColon - pstyle;
146 while (pstyle[nBeg] !=
'[') {
149 int nEnd = pColon - pstyle;
150 while (pstyle[nEnd] !=
']') {
154 String sub(pstyle + nBeg, nEnd - nBeg + 1);
155 int iBeg = atoi(pstyle + nBeg + 1);
156 int iEnd = atoi(pColon + 1);
157 for (
int i = iBeg; i <= iEnd; i++) {
167 for (
int i = 0; i < m_aStyleInfo.GetNum(); i++) {
168 m_nMinOrder = min(m_nMinOrder, m_aStyleInfo[i]->m_nOrder);
169 m_nMaxOrder = max(m_nMaxOrder, m_aStyleInfo[i]->m_nOrder);
174 for (
int pos = 0; pos < seq.
GetLen(); pos++) {
176 for (
int i = 0; i < m_aStyleInfo.GetNum(); i++) {
177 if (
false == m_aStyleInfo[i]->GetKey(seq, pos, seq.
GetLen()-pos, key) )
184 if (!bFound) *pInt = 0;
187 if (m_aStyleInfo[i]->m_nOrder == m_nMaxOrder)
195 if (!bFound) *pInt = 0;
199 if (pos + m_aStyleInfo[i]->m_nOrder == seq.
GetLen()) {
202 int *pInt = sub->
Insert(&endid, 1, bFound);
203 if (!bFound) *pInt = 0;
212 bool bNeedCutoff =
false;
213 if (m_aCutoff.GetNum() > 0) {
214 for (
int i = 0; i < m_aCutoff.GetNum(); i++) {
215 if (m_aCutoff[i] > 1) {
230 for (
int n = 1; n <= m_nMaxOrder + 1; n++) {
231 int nCut = m_aCutoff[min(n - 1, m_aCutoff.GetNum() - 1)];
236 while (psub = iter.
Next()) {
238 if (nCount >= nCut) {
239 *pNewTrie->
Insert(ngram, n) = nCount;
256 for (
int n = 1; n <= m_nMaxOrder+1; n++) {
261 while (psub = iter.
Next()) {
266 m_nNum = nid - begIdx;
271 if (pValue) afeat.
Add(*pValue);
287 pValue = psub->
Find(&endid, 1);
295 Array<int> *pKey = m_aKey[omp_get_thread_num()];
296 for (
int i = 0; i < m_aStyleInfo.GetNum(); i++) {
297 if (m_aStyleInfo[i]->m_nOrder == order) {
299 m_aStyleInfo[i]->GetKey(seq, pos, order, *pKey);
301 Find(afeat, *pKey, pos == 0, pos + order == seq.
GetLen());
309 for (
int pos = 0; pos < seq.
GetLen(); pos++)
311 for (
int i = 0; i < m_aStyleInfo.GetNum(); i++)
313 if (
false == m_aStyleInfo[i]->GetKey(seq, pos, seq.
GetLen() - pos, key) )
316 Find(afeat, key, pos == 0, pos + m_aStyleInfo[i]->m_nOrder == seq.
GetLen());
324 Array<int> *pkey = m_aKey[omp_get_thread_num()];
325 for (
int i = 0; i < m_aStyleInfo.GetNum(); i++) {
326 int order = m_aStyleInfo[i]->m_nOrder;
327 for (
int n = max(0, pos - order + 1); n <= min(nLen - order, pos); n++) {
328 lout_assert(
true == m_aStyleInfo[i]->GetKey(seq, n, order, *pkey));
329 Find(afeat, *pkey, n == 0, n + order == nLen);
336 Reset(strtok(pLine,
" \t\n"));
337 sscanf(strtok(NULL,
" \t\n"),
"order=[%d,%d]", &m_nMinOrder, &m_nMaxOrder);
338 sscanf(strtok(NULL,
" \t\n"),
"num=%d", &m_nNum);
340 for (
int i = 0; i < m_nNum; i++) {
342 strtok(pLine,
" \t\n");
344 int nid = atoi(strchr(strtok(NULL,
" \t\n"),
'=') + 1);
345 PValue v = atof(strchr(strtok(NULL,
" \t\n"),
'=') + 1);
346 char *p = strchr(strtok(NULL,
""),
'=') + 1;
349 p = strtok(p,
" \t\n");
352 p = strtok(NULL,
" \t\n");
354 *m_ptrie->Insert(key, key.
GetNum()) = nid;
362 file.
Print(
"%s order=[%d,%d] num=%d\n", m_style.GetBuffer(), m_nMinOrder, m_nMaxOrder, m_nNum);
365 for (
int n = 1; n <= m_nMaxOrder+1; n++) {
369 while (psub = iter.
Next()) {
372 file.
Print(
"%s\t id=%d value=%f \t key=", m_style.GetBuffer(), nid, (pValue)? pValue[nid]:0);
373 for (
int i = 0; i < n; i++) {
374 file.
Print(
"%d ", ngram[i]);
394 for (
int nSkipLen = 1; nSkipLen <= nOrder - 2; nSkipLen++) {
395 for (
int nSkipBeg = 1; nSkipBeg <= nOrder - nSkipLen - 1; nSkipBeg++) {
396 m_aTable.Add() =
new FeatTable(str.
Format(
"w[%d]-[%d]w[%d]", nSkipBeg, nSkipLen, nOrder - nSkipLen - nSkipBeg));
398 m_aTable.Add() =
new FeatTable(str.
Format(
"c[%d]-[%d]c[%d]", nSkipBeg, nSkipLen, nOrder - nSkipLen - nSkipBeg));
407 lout <<
"[Feat] Reset: Load feat style form file = " << pfeatType << endl;
410 File file(pfeatType,
"rt");
412 while (pLine = file.
GetLine()) {
414 char *p = strstr(pLine,
"//");
417 p = strtok(pLine,
" \t\n");
423 p = strtok(NULL,
" \t\n");
432 m_aTable.Add(pFeatTable);
438 for (
int i = 0; i < m_aTable.GetNum(); i++) {
439 nMaxOrder = max(nMaxOrder, m_aTable[i]->GetMaxOrder());
445 for (
int i = 0; i < m_aTable.GetNum(); i++) {
446 m_aTable[i]->Find(afeat, seq, pos, order);
451 for (
int i = 0; i < m_aTable.GetNum(); i++) {
452 m_aTable[i]->Find(afeat, seq);
457 for (
int i = 0; i < m_aTable.GetNum(); i++) {
458 if (NULL == strchr(m_aTable[i]->GetStyle(),
'w')) {
460 m_aTable[i]->Find(afeat, seq, pos, order);
466 for (
int i = 0; i < m_aTable.GetNum(); i++) {
467 if (strchr(m_aTable[i]->GetStyle(),
'w')) {
469 m_aTable[i]->Find(afeat, seq, pos, order);
482 for (
int i = 0; i < m_aTable.GetNum(); i++) {
483 m_aTable[i]->FindPosDep(afeat, seq, pos);
487 for (
int i = 0; i < m_aTable.GetNum(); i++) {
488 if (NULL == strchr(m_aTable[i]->GetStyle(),
'w')) {
489 m_aTable[i]->FindPosDep(afeat, seq, pos);
494 for (
int i = 0; i < m_aTable.GetNum(); i++) {
495 if (strchr(m_aTable[i]->GetStyle(),
'w')) {
496 m_aTable[i]->FindPosDep(afeat, seq, pos);
501 lout_error(
"[Feat] FindPosDep: unknown type = " << type);
507 File file(path,
"rt");
512 while (pLine = file.
GetLine()) {
516 char *p = strtok(pLine,
" \t\n");
519 p = strtok(NULL,
" \t\n");
526 for (
int i = 0; i < m_aTable.GetNum(); i++) {
527 m_aTable[i]->LoadFeat(seq);
533 lout <<
"[Feat] Feat cutoff..." << endl;
534 int nTotalCutNum = 0;
535 for (
int i = 0; i < m_aTable.GetNum(); i++) {
536 nTotalCutNum += m_aTable[i]->CutoffFeat();
538 lout <<
"[Feat] Feat cutoff num = " << nTotalCutNum << endl;
540 lout <<
"[Feat] Feat index..." << endl;
542 for (
int i = 0; i < m_aTable.GetNum(); i++) {
543 m_aTable[i]->IndexFeat(m_nTotalNum);
544 m_nTotalNum += m_aTable[i]->GetNum();
548 lout <<
"[Feat] = {" << endl;
549 for (
int i = 0; i < m_aTable.GetNum(); i++) {
550 lout <<
" " << m_aTable[i]->GetStyle() <<
": " << m_aTable[i]->GetNum();
551 lout <<
" order=[" << m_aTable[i]->GetMinOrder() <<
"," << m_aTable[i]->GetMaxOrder() <<
"]";
553 lout.
output(m_aTable[i]->m_aCutoff.GetBuffer(), m_aTable[i]->m_aCutoff.GetNum(),
"") << endl;
555 lout <<
" total = " << m_nTotalNum <<
"\n}" << endl;
560 file.
Print(
"feat-type=%d\n", m_aTable.GetNum());
561 file.
Print(
"feat={ ");
562 for (
int i = 0; i < m_aTable.GetNum(); i++) {
563 file.
Print(
"%s ", m_aTable[i]->GetStyle());
567 for (
int i = 0; i < m_aTable.GetNum(); i++) {
568 m_aTable[i]->WriteT(file, pValue);
576 fscanf(file,
"feat-type=%d\n", &nNum);
578 m_aTable.SetNum(nNum);
579 for (
int i = 0; i < m_aTable.GetNum(); i++) {
581 m_aTable[i]->ReadT(file, pValue);
585 lout <<
"[Feat] = {" << endl;
586 for (
int i = 0; i < m_aTable.GetNum(); i++) {
587 lout <<
" " << m_aTable[i]->GetStyle() <<
": " << m_aTable[i]->GetNum();
588 lout <<
" order=[" << m_aTable[i]->GetMinOrder() <<
"," << m_aTable[i]->GetMaxOrder() <<
"]" << endl;
590 lout <<
" total = " << m_nTotalNum <<
"\n}" << endl;
char c
the character "w" or "c"
void Reset(const char *pstyle)
Reset.
_wb_TRIE * Next()
Get next trie.
void FindWord(Array< int > &afeat, Seq &seq, int pos, int order)
Find the ngram feature depending on word[pos].
#define SAFE_DELETE(p)
memory release
const char * Format(const char *p_pMessage,...)
format print to string
T & Get(int i)
get the value at position i
int GetMaxOrder()
Get maximum order.
void SetClass(Vocab *pv)
set the class based on the word sequence
void Random(Vocab *pv)
Random.
DataT * Find(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Find a value.
Analyse a determinate feat style (without ":")
_wb_TRIE * InsertTrie(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Insert a sub-trie.
short n
number, "w[2]" then n=2
Get all the values whose indexes are of a fixed length. The returned tries may not contain a legal va...
void LoadFeatFromCorpus(const char *path, Vocab *pv)
Load Features from corpus.
define the feature style. such as "w3"(word-3gram); "c2"(class-2gram);
VocabID GetClass(VocabID wid)
get class
Log & output(T *pArray, int n, const char *pgap=" ")
output an array
void ReadT(File &file, PValue *pValue=NULL)
Read from file.
void Reset(int p_len)
reset only change the len variable, does not change the buffer size.
void WriteT(File &file, PValue *pValue=NULL)
Write the features.
T * GetBuffer(int i=0) const
get the buffer pointer
String Replace(const char *src, const char *rpl)
replace
VocabID RandClass()
random a class
virtual void Print(const char *p_pMessage,...)
print
void Find(Array< int > &afeat, Array< int > &key, bool bBeg, bool bEnd)
Find the corresponding feature using a key. This will return the beg/end ngram.
Array< int > m_aCutoff
cutoff setting for different order
define a sequence including the word sequence and class sequence
DataT * Insert(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Insert a value.
short i
if "w" then i=0, if "c" then i=1, used to index the value in Seq
_wb_TRIE * FindTrie(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Find a sub-trie.
void Insert(T t)
insert a value. Avoid repeating
void Set(const char *pstyle)
set and analyze the style
void ReadT(File &file, PValue *pValue=NULL)
Read the features.
void Set(Array< int > &aInt, Vocab *pv)
transform the word sequence (from file) to Seq
virtual char * GetLine(bool bPrecent=false)
Read a line into the buffer.
int CutoffFeat()
cutoff the features
void Progress(long long n=-1, bool bInit=false, long long total=100, const char *head="")
progress bar
void FindPosDep(Array< int > &afeat, Seq &seq, int pos, int type=0)
Find the class ngram depending on the nPos.
void WriteT(File &file, PValue *pValue=NULL)
Write to file.
void Clean()
Clean the array. Just set the top of the array to -1 and do not release the memory.
void IndexFeat(int begIdx)
Set the number of each feature. We should index the features in different tables. ...
void Find(Array< int > &afeat, Seq &seq, int pos, int order)
Find the ngram feature with a fixed order.
Array< int > * GetWord(VocabID cid)
get the words belonging to a class
void Reset(int nOrder, bool bClass)
Reset, set the order. Note: the maximum order (including the skip) may be larger than nOrder...
int GetNum() const
Get Array number.
void Add(T t)
Add a value to the tail of array.
void FindClass(Array< int > &afeat, Seq &seq, int pos, int order)
Find the class ngram feature with a fixed order.
FeatTable(const char *pstyle="")
constructor
Log lout
the definition is in wb-log.cpp
DataT * GetData()
Get value.
void PrintArray(const char *pformat, TYPE *pbuf, int num)
print an array into file
bool GetKey(Seq &seq, int nPos, int nOrder, Array< int > &key)
map an ngram to the index key; return whether a correct key was obtained
void FindPosDep(Array< int > &afeat, Seq &seq, int pos)
Find all the feature depending on position.
void LoadFeat(Seq &seq)
Extract a feature from a sequence.
#define SAFE_DEL_POINTER_ARRAY(a)