17 while (pLine = file.
GetLine(
true)) {
21 char *pWord = strtok(pLine,
" \t");
23 aWords.
Add(atoi(pWord));
24 pWord = strtok(NULL,
" \t");
29 for (
int i = 0; i < aWords.
GetNum(); i++) {
37 for (
int i = 0; i < aWords.
GetNum() - 1; i++) {
43 idx[0] = aWords[i + 1];
46 if (!bFound) *pCount = 1;
62 lout <<
"Load Tagvocab: " << pTagVocab << endl;
71 while (pCount = iter.
Next(w)) {
75 while (heap.
OutTop(w, n)) {
103 lout <<
"Update class Unigram" << endl;
107 while (pCount = iter.
Next(w)) {
112 lout <<
"Update class Bigram" << endl;
119 while (pSub = iter2.
Next()) {
140 lout <<
"Prepare Sum" << endl;
143 while (pCount = iterW.
Next(w)) {
154 while (pCount = iter.
Next(w)) {
155 file.
Print(
"%d\t%d\n", w, *pCount);
163 while (pSub = iter.
Next()) {
166 file.
Print(
"%d %d\t%d\n", gram[1], gram[0], *(pSub->
GetData()));
168 file.
Print(
"%d %d\t%d\n", gram[0], gram[1], *(pSub->
GetData()));
174 File file(path,
"wt");
192 File file(path,
"wt");
194 file.
Print(
"[%d]\t", i);
195 for (
int n = 0; n < aClass[i]->
GetNum(); n++) {
197 int w = aClass[i]->
Get(n);
199 int ncount = (pcount == NULL) ? 0 : *pcount;
201 file.
Print(
"%d{%d} ", aClass[i]->Get(n), ncount);
212 File file(path,
"wt");
219 File file(path,
"rt");
222 fscanf(file,
"%d\t%d %d\n", &g, &w, &c);
232 double dSumClassGram = 0;
233 double dSumClass = 0;
250 for (
int j = 0; j < m_nClassNum + 1; j++) {
253 lout_error(
"classGramCount (" << n <<
") < 0")
256 dSumClassGram += 1.0 * n /
m_nSentNum * log((
double)n);
267 while (pCount = iterC.
Next(c)) {
270 dSumClass += 1.0 * (*pCount) /
m_nSentNum * log((
double)(*pCount));
287 int sig = (bOut) ? -1 : 1;
305 while (p = iter.
Next(g[0])) {
315 while (p = iter.
Next(g[1])) {
362 while (p = iter.
Next(v)) {
382 while (p = iter.
Next(v)) {
412 bool bChange =
false;
439 double dOptValue = -1e22;
441 for (
int c = 0; c < nTotalSwitchClassNum; c++)
448 if (dCurValue > dOptValue) {
449 dOptValue = dCurValue;
457 if (nOptClass != nOldClass) {
458 lout <<
"[exchange_" << nTimes + 1 <<
"] " << w <<
" class_" << nOldClass <<
" -> class_" << nOptClass <<
" value=" << dOptValue << endl;
460 if (nOptClass >= nTotalSwitchClassNum) {
461 lout_error(
"[cluster] 未定义的to class-id (" << nOptClass <<
") for word (" << w <<
") ");
481 lout << i <<
"[" << aNum[i] <<
"] ";
493 if (nTimes == nMaxTime) {
494 lout <<
"[end] Get Max Times" << endl;
498 lout <<
"[end] No Change" << endl;
515 nTemp = (int)sqrt((
double)(*p));
521 int w, count, preCount = -1;
522 while (heap.
OutTop(w, count)) {
525 if (count != preCount) {
543 lout <<
"Write to " << path << endl;
546 if (m_word_count.Find(w)) {
547 f.
Print(
"%d\t%d\n", w, m_mMap[w]);
553 lout <<
"Read from" << path << endl;
557 int wid = atoi(strtok(pLine,
" \t\n"));
558 int cid = atoi(strtok(NULL,
" \t\n"));
571 File file(path,
"rt");
573 while (pLine = file.
GetLine(
true)) {
577 char *pWord = strtok(pLine,
" \t");
579 aWords.
Add(atoi(pWord));
580 pWord = strtok(NULL,
" \t");
585 for (
int i = 0; i < aWords.
GetNum(); i++) {
587 int *pCount = m_word_count.Insert(aWords[i], bFound);
595 for (
int i = 0; i < aWords.
GetNum() - 1; i++) {
598 key[1] = aWords[i + 1];
600 int *pCount = m_wgram_count.
Insert(key, 2, bFound);
608 pCount = m_inv_wgram_count.Insert(key, 2, bFound);
621 lout <<
"The word_num(" << m_word_count.GetNum() <<
") < class_num(" <<
m_nClassNum <<
")" << endl;
622 lout <<
"no need to cluster!!" << endl;
630 lout <<
"Init the class from file: " << path_init_res << endl;
631 ReadRes(path_init_res);
634 lout <<
"Init the class based unigram count" << endl;
637 int *pCount = m_word_count.Find(w);
647 while (heap.
OutTop(w, n)) {
653 UpdateCount(m_mCountBuf);
669 while (pCount = hash_iter.
Next(wid)) {
670 CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount);
677 for (keys[0] = 0; keys[0] <
m_nClassNum; keys[0]++) {
678 for (keys[1] = 0; keys[1] <
m_nClassNum; keys[1]++) {
679 CountAdd(aCountBuf, m_class_gram, keys, 2, 0);
683 for (keys[1] = 0; keys[1] <
m_nClassNum; keys[1]++) {
684 CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0);
685 CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0);
693 while (pSub = trie_iter.
Next()) {
698 keys[0] = m_mMap[wgram[0]];
699 keys[1] = m_mMap[wgram[1]];
700 CountAdd(aCountBuf, m_class_gram, keys, 2, count);
701 keys[0] = m_mMap[wgram[0]];
704 CountAdd(aCountBuf, m_class_word_gram, keys, 2, count);
706 keys[1] = m_mMap[wgram[1]];
707 CountAdd(aCountBuf, m_word_class_gram, keys, 2, count);
713 lout <<
"Prepare Sum" << endl;
716 while (pCount = iterW.
Next(wid)) {
721 lout <<
"Total Class Count Buf = " << aCountBuf.
GetNum() << endl;
723 CopyCountToThreads(aCountBuf);
728 int *pIdx = hash.
Insert(key, bFound);
730 *pIdx = aCountBuf.
GetNum();
731 aCountBuf[*pIdx] = 0;
733 aCountBuf[*pIdx] += count;
738 int *pIdx = hash.
Insert(pKey, nLen, bFound);
740 *pIdx = aCountBuf.
GetNum();
741 aCountBuf[*pIdx] = 0;
743 aCountBuf[*pIdx] += count;
748 int *pIdx = hash.
Find(key, bFound);
750 lout_error(
"[CountAdd] no find the hash key=" << key);
752 aCountBuf[*pIdx] += count;
757 int *pIdx = hash.
Find(pKey, nLen, bFound);
759 lout_error(
"[CountAdd] no find the trie key=" << pKey[0]);
761 aCountBuf[*pIdx] += count;
765 int nThread = omp_get_max_threads();
768 m_tCountBuf.Reset(nThread, aCountBuf.
GetNum());
769 for (
int t = 0; t < nThread; t++) {
770 memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.
GetBuffer(),
sizeof(aCountBuf[0])*aCountBuf.
GetNum());
774 for (
int t = 0; t < nThread; t++) {
775 memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(),
sizeof(m_mMap[0])*m_mMap.GetNum());
780 if (m_word_count.Find(nWord) == NULL)
783 int nClass = vMap[
nWord];
784 int sig = (bOut) ? -1 : 1;
785 int tid = omp_get_thread_num();
788 pCount = m_word_count.Find(nWord);
789 CountAdd(vCountBuf, m_class, nClass, sig *(*pCount));
800 while (p = iter.
Next(g[0])) {
801 int count = vCountBuf[*p->
GetData()];
802 CountAdd(vCountBuf, m_class_gram, g, 2, sig*count);
807 pSub = m_word_class_gram.
FindTrie(&nWord, 1);
811 while (p = iter.
Next(g[1])) {
812 int count = vCountBuf[*p->
GetData()];
813 CountAdd(vCountBuf, m_class_gram, g, 2, sig*count);
821 pCount = m_wgram_count.Find(w, 2);
823 CountAdd(vCountBuf, m_class_gram, g, 2, *pCount);
832 pSub = m_inv_wgram_count.
FindTrie(&nWord, 1);
836 while (p = iter.
Next(v)) {
840 CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount));
844 pSub = m_wgram_count.
FindTrie(&nWord, 1);
848 while (p = iter.
Next(v)) {
852 CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount));
858 if (nToClass == vMap[nWord])
861 MoveWord(vCountBuf, vMap, nWord,
true);
862 vMap[
nWord] = nToClass;
863 MoveWord(vCountBuf, vMap, nWord,
false);
867 int nThread = omp_get_max_threads();
874 if (m_word_count.Find(w)) {
878 lout <<
"[Cluster] max-thread = " << nThread << endl;
879 lout <<
"[Cluster] observed word = " << aWordList.
GetNum() << endl;
880 lout <<
"[Cluster] Begin..." << endl;
882 double dPreValue = -1e22;
885 for (
int t = 0; t < nMaxTime; t++) {
886 bool bChange =
false;
887 for (
int block = 0; block < aWordList.
GetNum() / nThread; block++) {
889 CopyCountToThreads(m_mCountBuf);
890 #pragma omp parallel for 891 for (
int i = block*nThread; i < min(aWordList.
GetNum(), (block + 1)*nThread); i++) {
892 int w = aWordList[i];
899 double dOptValue = -1e22;
919 if (dCurValue > dOptValue) {
920 dOptValue = dCurValue;
925 aWordPreThread[omp_get_thread_num()] = w;
926 aOptClassPreThread[omp_get_thread_num()] = nOptClass;
930 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum());
932 for (
int i = 0; i < nThread; i++) {
934 int w = aWordPreThread[i];
935 int c_old = main_map[w];
936 int c_new = aOptClassPreThread[i];
947 if (c_old != c_new) {
948 lout <<
"[exchange " << t <<
"] w=" << w
949 <<
" from class_" << c_old
950 <<
" to class_" << c_new
951 <<
" LL=" << dCurValue << endl;
962 aClassContent.
Fill(0);
964 aClassContent[m_mMap[w]]++;
966 lout <<
"[exchange " << t <<
" end] ";
968 lout << c <<
"[" << aClassContent[c] <<
"] ";
972 if (bChange ==
false) {
973 lout <<
"unchange..." << endl;
978 lout <<
"[Cluster] End" << endl;
983 double dSumClassGram = 0;
984 double dSumClass = 0;
990 while (psub = trie_iter2.
Next()) {
991 int count = vCountBuf[*psub->
GetData()];
993 dSumClassGram += 1.0 * count /
m_nSentNum * log((
double)count);
1014 while (pIdx = iterC.
Next(c)) {
1015 int count = vCountBuf[*pIdx];
1017 dSumClass += 1.0 * count /
m_nSentNum * log((
double)count);
1020 double dRes = dSumClassGram - 2 * dSumClass +
m_dWordLogSum;
1029 int *p = m_word_count.Find(w);
1034 nTemp = (int)sqrt((
double)(*p));
1040 int w, count, preCount = -1;
1041 while (heap.
OutTop(w, count)) {
1044 if (count != preCount) {
_wb_TRIE * Next()
Get next trie.
void Cluster(int nMaxTime=-1)
#define SAFE_DELETE(p)
memory release
DataT * Next(KeyT &key)
get next value
void CopyCountToThreads(Array< int > &aCountBuf)
T & Get(int i)
get the value at position i
void SimpleCluster()
使用出现频率进行简单的分类,不需要迭代
bool IsDataLegal()
detect if current trie have legal value
void Read_TagVocab(const char *path)
DataT * Find(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Find a value.
void WriteRes_WordClass(const char *path)
Get all the values whose indexes are of a fixed length. The returned tries may not contain a legal va...
void InitCount(const char *path, const char *pTagVocab=NULL)
void InitCount(const char *path, const char *path_init_res=NULL)
void WriteRes_TagVocab(const char *path)
void ExchangeWord(VecShell< int > vCountBuf, VecShell< int > vMap, int nWord, int nToClass)
exchange the nWord from m_aClass[nWord] to nToClass
Array< int > aWords(omp_get_max_threads())
void WriteRes(const char *path)
T * GetBuffer(int i=0) const
get the buffer pointer
void MoveWord(int nWord, bool bOut=true)
void WriteCount(LHash< int, int > &count, File &file)
void Insert(TValue p_value, TWeight p_w)
insert a value
virtual void Print(const char *p_pMessage,...)
print
int GetNum() const
Get the unit number.
DataT * Insert(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Insert a value.
_wb_TRIE * FindTrie(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Find a sub-trie.
void Insert(T t)
insert a value. Avoid repeating
Array< int > m_aClass
记录每个词w所在的类g
void ReadRes(const char *path)
#define HEAPMODE_MAXHEAP
max heap, used to sort from large to small
void SetNum(int n)
Set Array number, to malloc enough memory.
virtual char * GetLine(bool bPrecent=false)
Read a line into the buffer.
void Progress(long long n=-1, bool bInit=false, long long total=100, const char *head="")
progress bar
_wb_TRIE * Next(KeyT &key)
Get next sub-trie.
void MoveWord(VecShell< int > vCountBuf, VecShell< int > vMap, int nWord, bool bOut=true)
move word in/out of a class and update the counts
DataT * Insert(KeyT key, bool &bFound)
Insert a value.
void Clean()
Clean the array. Just set the top of array to -1 and do not release the memory.
LHash< int, int > m_wordCount
N(w)
Trie< int, int > m_wordGramCount
N(w,v)
double LogLikelihood(VecShell< int > vCountBuf)
calculate the log-likelihood
int GetNum() const
Get Array number.
void Add(T t)
Add a value to the tail of array.
void CountAdd(Array< int > &aCountBuf, LHash< int, int > &hash, int key, int count)
Log lout
the definition is in wb-log.cpp
DataT * GetData()
Get value.
Trie< int, int > m_wordClassCount
N(w,g), 储存时,w在前,g在后
void Fill(T m)
set all the values to m
Trie< int, int > m_invWordGram
储存每个w的前继,不计数,仅用于索引每个w的前继v
void WriteRes_ClassWord(const char *path)
LHash< int, int > m_classCount
N(g)
int m_nVocabSize
word-id的个数
void SimpleCluster()
使用出现频率进行简单的分类,不需要迭代
void ExchangeWord(int nWord, int nToClass)
exchange the nWord from m_aClass[nWord] to nToClass
Trie< int, int > m_classWordCount
N(g,w), 储存时,w在前,g在后
bool OutTop(TValue &p_value, TWeight &p_w)
out the top
void Fill(DataT d)
set all the values to d
void UpdateCount(Array< int > &aCountBuf)
void CountAdd(LHash< int, int > &count, int nWord, int nAdd)
double m_dWordLogSum
记录sum{N(w)logN(w)} ,因为仅仅需要计算一次
void Fill(DataT d)
Set all the values to d.
define all the code written by Bin Wang.
void Cluster(int nMaxTime=-1)
cluster
int ** m_pClassGramCount
N(g_w,g_v);.
DataT * Find(KeyT key, bool &bFound)
Find a value.