TRF Language Model
wb-word-cluster.cpp
Go to the documentation of this file.
1 #include "wb-word-cluster.h"
2 #include <omp.h>
3 
4 namespace wb
5 {
6  void WordCluster::InitCount(const char *path, const char *pTagVocab)
7  {
// Scan a corpus of integer word-id sentences (one sentence per line) and
// accumulate the raw statistics used by the clustering algorithm:
//   m_wordCount     — unigram count N(w)
//   m_wordGramCount — bigram count N(w_i, w_{i+1})
//   m_invWordGram   — reversed bigram (w_{i+1}, w_i), kept so that the
//                     predecessors of a word can be enumerated efficiently
// Also tracks m_nSentNum, m_nVocabSize (= max word-id + 1), and the number
// of distinct unigrams / bigrams seen.
// pTagVocab: optional tag-vocabulary file; when non-NULL (handled further
// down in this function, outside this view) the class assignment is loaded
// from it instead of being initialized from unigram counts.
8  bool bFound;
9  File file(path, "rt");
10  char *pLine = NULL;
11  m_nSentNum = 0;
12  m_nVocabSize = 0;
13 
14  m_nUnigramNum = 0;
15  m_nBigramNum = 0;
16 
// Read the corpus line by line; GetLine(true) presumably strips the newline.
17  while (pLine = file.GetLine(true)) {
18 
20 
// NOTE(review): listing line 19 is missing from this view; it presumably
// declares the per-sentence buffer `aWords` used below (the parallel
// WordCluster_t::InitCount declares `Array<int> aWords;` at this point) —
// confirm against the original source file.
// Tokenize the line on blanks/tabs; each token is an integer word-id.
21  char *pWord = strtok(pLine, " \t");
22  while (pWord) {
23  aWords.Add(atoi(pWord));
24  pWord = strtok(NULL, " \t");
25  }
26 
27  m_nSentNum += 1;
28  //Unigram
// Count unigrams; a word-id seen for the first time starts at count 1 and
// bumps the distinct-unigram counter. Vocabulary size follows the max id.
29  for (int i = 0; i < aWords.GetNum(); i++) {
30  m_nVocabSize = max(m_nVocabSize, aWords[i] + 1);
31 
32  int *pCount = m_wordCount.Insert(aWords[i], bFound);
33  if (!bFound) { *pCount = 1; m_nUnigramNum++; }
34  else (*pCount)++;
35  }
36  //Bigram
// Count each adjacent pair (w_i, w_{i+1}) in the forward trie, and the
// swapped pair (w_{i+1}, w_i) in m_invWordGram. Note the inverse counts are
// not included in m_nBigramNum — that counter tracks forward bigrams only.
37  for (int i = 0; i < aWords.GetNum() - 1; i++) {
38  int *pCount = m_wordGramCount.Insert(aWords.GetBuffer(i), 2, bFound);
39  if (!bFound) { *pCount = 1; m_nBigramNum++; }
40  else (*pCount)++;
41 
// idx[] holds the reversed key: successor first, then predecessor.
42  int idx[3];
43  idx[0] = aWords[i + 1];
44  idx[1] = aWords[i];
45  pCount = m_invWordGram.Insert(idx, 2, bFound);
46  if (!bFound) *pCount = 1;
47  else (*pCount)++;
48  }
49  }
50 
51 
57 
60 
61  if (pTagVocab) {
62  lout << "Load Tagvocab: " << pTagVocab << endl;
63  Read_TagVocab(pTagVocab);
64  return;
65  }
66  //对Unigram排序,初始分类 Heap<int, int> heap(HEAPMODE_MAXHEAP); int w, n; int *pCount; LHashIter<int, int> iter(&m_wordCount); while (pCount = iter.Next(w)) { heap.Insert(w, *pCount); } int nClass = 0; while (heap.OutTop(w, n)) { //前m_nClass-2个word,每个word一个类;后边的所有出现的class赋予m_nClassNum-2类;未出现的赋予m_nClass-1类 if (nClass <= m_nClassNum - 1) m_aClass[w] = nClass; else m_aClass[w] = m_nClassNum - 1; //m_aClass[w] = (nClass >= m_nClassNum-2)? m_nClassNum-2: nClass; nClass++; } //WriteCount(m_wordGramCount, wbFile("test.count", "wt")); //计算类别相关的count UpdataCount(); } void WordCluster::UpdataCount() { //wbFile test("a.dbg", "wt"); //清空count数据 m_classCount.Fill(0); //m_classGramCount.Fill(0); for (int i = 0; i < m_nClassNum + 1; i++) memset(m_pClassGramCount[i], 0, sizeof(int)*(m_nClassNum + 1)); m_classWordCount.Fill(0); m_wordClassCount.Fill(0); int w, n; int *pCount; //计算类别相关的count //Unigram class lout << "Update class Unigram" << endl; LHashIter<int, int> iter(&m_wordCount); int nTempTimes = 0; lout.Progress(0, true, m_nUnigramNum, "Update Unigram:"); while (pCount = iter.Next(w)) { CountAdd(m_classCount, m_aClass[w], *pCount); lout.Progress(++nTempTimes); } //Bigram class lout << "Update class Bigram" << endl; int gram[10]; int g[10]; Trie<int, int> *pSub; TrieIter2<int, int> iter2(&m_wordGramCount, gram, 2); nTempTimes = 0; lout.Progress(0, true, m_nBigramNum, "Update Bigram:"); while (pSub = iter2.Next()) { if (!pSub->IsDataLegal()) continue; //test.Print("%d %d\t%d\n", gram[0], gram[1], *(pSub->GetData())); g[0] = m_aClass[gram[0]]; g[1] = m_aClass[gram[1]]; CountAdd(m_pClassGramCount, g, 2, *(pSub->GetData())); g[0] = m_aClass[gram[0]]; g[1] = gram[1]; Reverse(g); //将word储存在前,为了方便遍历某个word的有效前级class CountAdd(m_classWordCount, g, 2, *(pSub->GetData())); g[0] = gram[0]; g[1] = m_aClass[gram[1]]; CountAdd(m_wordClassCount, g, 2, *(pSub->GetData())); lout.Progress(++nTempTimes); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> 
iterW(&m_wordCount); while (pCount = iterW.Next(w)) { lout_assert(*pCount >= 0); if (*pCount != 0) m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } } void WordCluster::WriteCount(LHash<int, int> &count, File &file) { LHashIter<int, int> iter(&count); int w; int *pCount; while (pCount = iter.Next(w)) { file.Print("%d\t%d\n", w, *pCount); } } void WordCluster::WriteCount(Trie<int, int> &count, File &file, bool bReverse /*=false*/) { int gram[10]; Trie<int, int> *pSub; TrieIter2<int, int> iter(&count, gram, 2); while (pSub = iter.Next()) { if (pSub->IsDataLegal()) { if (bReverse) file.Print("%d %d\t%d\n", gram[1], gram[0], *(pSub->GetData())); else file.Print("%d %d\t%d\n", gram[0], gram[1], *(pSub->GetData())); } } } void WordCluster::WriteRes_WordClass(const char *path) { File file(path, "wt"); for (int i = 0; i < m_aClass.GetNum(); i++) { file.Print("%d\t%d\n", i, m_aClass[i]); } } void WordCluster::WriteRes_ClassWord(const char *path) { Array<Array<int>*> aClass(m_nClassNum); for (int i = 0; i < m_nClassNum + 1; i++) aClass[i] = new Array<int>(); int w; for (w = 0; w < m_nVocabSize; w++) { aClass[m_aClass[w]]->Add(w); } File file(path, "wt"); for (int i = 0; i < m_nClassNum; i++) { file.Print("[%d]\t", i); for (int n = 0; n < aClass[i]->GetNum(); n++) { int w = aClass[i]->Get(n); int *pcount = m_wordCount.Find(w); int ncount = (pcount == NULL) ? 
0 : *pcount; file.Print("%d{%d} ", aClass[i]->Get(n), ncount); //打印出现count } file.Print("\n"); } for (int i = 0; i < m_nClassNum; i++) SAFE_DELETE(aClass[i]); } void WordCluster::WriteRes_TagVocab(const char *path) { File file(path, "wt"); for (int i = 0; i < m_aClass.GetNum(); i++) { file.Print("%d\t%d %d\n", i, i, m_aClass[i]); } } void WordCluster::Read_TagVocab(const char *path) { File file(path, "rt"); for (int i = 0; i < m_aClass.GetNum(); i++) { int g, w, c; fscanf(file, "%d\t%d %d\n", &g, &w, &c); if (g != i) { lout_error("read TagVocab ERROR!!"); } m_aClass[g] = c; } UpdataCount(); } double WordCluster::LogLikelihood() { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; // Sum{ N(g_v,g_w)logN(g_v,g_w) } // int g[10]; // wbTrie<int,int> *pSub; // wbTrieIter2<int,int> iter2(m_classGramCount, g, 2); // while (pSub = iter2.Next()) { // if (!pSub->IsDataLegal()) // continue; // // int n = *(pSub->GetData()); // lout_assert( n>= 0 ); // if ( n != 0 ) { // dSumClassGram += 1.0 * n/m_nSentNum * log((double)n); // } // } for (int i = 0; i < m_nClassNum + 1; i++) { for (int j = 0; j < m_nClassNum + 1; j++) { int n = m_pClassGramCount[i][j]; if (n < 0) { lout_error("classGramCount (" << n << ") < 0") } if (n != 0) { dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); } } } //Sum { N(g)logN(g) } int c, w; int *pCount; LHashIter<int, int> iterC(&m_classCount); while (pCount = iterC.Next(c)) { lout_assert(*pCount >= 0); if (*pCount != 0) dSumClass += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } //Sum { N(w)logN(w) } // wbLHashIter<int,int> iterW(&m_wordCount); // while (pCount = iterW.Next(w)) { // lout_assert(*pCount>=0); // if ( *pCount != 0 ) // dSumWord += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); // } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster::MoveWord(int nWord, bool bOut /* = true */) { int nClass = m_aClass[nWord]; int sig = (bOut) ? 
-1 : 1; // class unigram int *pCount = m_wordCount.Find(nWord); if (pCount == NULL) return; CountAdd(m_classCount, nClass, sig*(*pCount)); // class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_classWordCount.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { CountAdd(m_pClassGramCount, g, 2, sig*(*p->GetData())); } } g[0] = nClass; pSub = m_wordClassCount.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { CountAdd(m_pClassGramCount, g, 2, sig*(*p->GetData())); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wordGramCount.Find(w, 2); if (pCount) { CountAdd(m_pClassGramCount, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; // int a1=0, a2=0; // int b1=0, b2=0; // wbLHashIter<int,int> vocabIter(&m_wordCount); // while (vocabIter.Next(v)) { //遍历所有的词v // g[0] = v; // // w[0] = v; // w[1] = nWord; // pCount = m_wordGramCount.Find(w, 2); // if (pCount) { // a1 ++; // CountAdd(m_wordClassCount, g, 2, sig*(*pCount)); // } // // w[0] = nWord; // w[1] = v; // pCount = m_wordGramCount.Find(w, 2); // if (pCount) { // b1 ++; // CountAdd(m_classWordCount, g, 2, sig*(*pCount)); // } // } //遍历nWord的前继 pSub = m_invWordGram.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; w[0] = v; w[1] = nWord; // pCount = m_wordGramCount.Find(w, 2); // if ( *pCount != *(p->GetData()) ) { // lout_error("ERROR_test"); // } pCount = p->GetData(); if (pCount) { //a2++; CountAdd(m_wordClassCount, g, 2, sig*(*pCount)); } } } //遍历nWord后继 pSub = m_wordGramCount.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; pCount = p->GetData(); if (pCount) { //b2++; CountAdd(m_classWordCount, g, 2, sig*(*pCount)); } } } // lout_variable(a1); // 
lout_variable(a2); // lout_variable(b1); // lout_variable(b2); // Pause(); } void WordCluster::ExchangeWord(int nWord, int nToClass) { if (nToClass == m_aClass[nWord]) return; MoveWord(nWord, true); // move out from nClass m_aClass[nWord] = nToClass; MoveWord(nWord, false); // move into nToClass // m_aClass[nWord] = nToClass; // UpdataCount(); } void WordCluster::Cluster(int nMaxTime /* = -1 */) { bool bChange = false; int nTimes = 0; lout_variable(m_nVocabSize); int nTotalSwitchClassNum = m_nClassNum; //将未出现的word分到同一类 for (int w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) { m_aClass[w] = m_nClassNum - 1; ///< 赋予最后一个类 nTotalSwitchClassNum = m_nClassNum - 1; } } while (1) //外层循环 { bChange = false; int w; // wbLHashIter<int,int> vocabIter(&m_wordCount); // while (vocabIter.Next(w)) //遍历每个word for (w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) continue; int nOldClass = m_aClass[w]; //保存目前的class int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < nTotalSwitchClassNum; c++) //转移到每个class { ExchangeWord(w, c); double dCurValue = LogLikelihood(); //替换后的负对数似然值 // lout << w << " to class_" << c << " ll=" << dCurValue << endl; // Pause(); if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } #ifdef _DEBUG //lout<<"class_"<<nOldClass<<" -> class_"<<nOptClass<<endl; #endif ExchangeWord(w, nOptClass); if (nOptClass != nOldClass) {//改变了类 lout << "[exchange_" << nTimes + 1 << "] " << w << " class_" << nOldClass << " -> class_" << nOptClass << " value=" << dOptValue << endl; bChange = true; if (nOptClass >= nTotalSwitchClassNum) { lout_error("[cluster] 未定义的to class-id (" << nOptClass << ") for word (" << w << ") "); } /*Pause();*/ } } lout_variable(LogLikelihood()); //统计下每个class中的词数目 Array<int> aNum(m_nClassNum); aNum.Fill(0); // vocabIter.Init(); // while (vocabIter.Next(w)) { for (w = 0; w < m_nVocabSize; w++) { if (m_aClass[w] >= m_nClassNum) { lout_error("[cluster] 未定义的class-id (" << m_aClass[w] << ") 
for word (" << w << ") "); } aNum[m_aClass[w]] ++; } for (int i = 0; i < m_nClassNum; i++) lout << i << "[" << aNum[i] << "] "; lout << endl; //打印当前的结果 if (m_pathWordClass) WriteRes_WordClass(m_pathWordClass); if (m_pathClassWord) WriteRes_ClassWord(m_pathClassWord); if (m_pathTagVocab) WriteRes_TagVocab(m_pathTagVocab); nTimes++; if (nTimes == nMaxTime) { lout << "[end] Get Max Times" << endl; break; } if (!bChange) { lout << "[end] No Change" << endl; break; } } } void WordCluster::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_wordCount.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_aClass[w] = m_nClassNum - 1; else m_aClass[w] = n; } } void WordCluster_t::WriteRes(const char *path) { lout << "Write to " << path << endl; File f(path, "wt"); for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { f.Print("%d\t%d\n", w, m_mMap[w]); } } } void WordCluster_t::ReadRes(const char *path) { lout << "Read from" << path << endl; File f(path, "rt"); char *pLine; while (pLine = f.GetLine()) { int wid = atoi(strtok(pLine, " \t\n")); int cid = atoi(strtok(NULL, " \t\n")); m_mMap[wid] = cid; } } void WordCluster_t::InitCount(const char *path, const char *path_init_res /* = NULL */) { m_nSentNum = 0; m_nVocabSize = 0; m_nUnigramNum = 0; m_nBigramNum = 0; bool bFound; File file(path, "rt"); char *pLine = NULL; while (pLine = file.GetLine(true)) { Array<int> aWords; char *pWord = strtok(pLine, " \t"); while (pWord) { aWords.Add(atoi(pWord)); pWord = strtok(NULL, " \t"); } m_nSentNum += 1; //Unigram for (int i = 0; i < aWords.GetNum(); i++) { m_nVocabSize = max(m_nVocabSize, aWords[i] + 1); int *pCount = m_word_count.Insert(aWords[i], bFound); if (!bFound) { 
*pCount = 0; m_nUnigramNum++; } *pCount += 1; } //Bigram for (int i = 0; i < aWords.GetNum() - 1; i++) { int key[3]; key[0] = aWords[i]; key[1] = aWords[i + 1]; int *pCount = m_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; m_nBigramNum++; } *pCount += 1; Reverse(key); pCount = m_inv_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; } *pCount += 1; } } lout_variable(m_word_count.GetNum()); lout_variable(m_nVocabSize); lout_variable(m_nClassNum); lout_variable(m_nUnigramNum); lout_variable(m_nBigramNum); if (m_word_count.GetNum() < m_nClassNum) { lout << "The word_num(" << m_word_count.GetNum() << ") < class_num(" << m_nClassNum << ")" << endl; lout << "no need to cluster!!" << endl; exit(1); } m_mMap.SetNum(m_nVocabSize); m_mMap.Fill(m_nVocabSize-1); if (path_init_res) { lout << "Init the class from file: " << path_init_res << endl; ReadRes(path_init_res); } else { lout << "Init the class based unigram count" << endl; Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *pCount = m_word_count.Find(w); if (pCount) { heap.Insert(w, *pCount); } else { heap.Insert(w, 0); // zero count } } int w, n; int nClass = 0; while (heap.OutTop(w, n)) { m_mMap[w] = min(nClass, m_nClassNum - 1); nClass++; } } WriteRes(m_pathRes); UpdateCount(m_mCountBuf); // lout_variable(m_dWordLogSum); // lout_variable(LogLikelihood(VecShell<int>(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()))); // cluster.InitCount(path); // lout_variable(cluster.m_dWordLogSum); // lout_variable(cluster.LogLikelihood()); } void WordCluster_t::UpdateCount(Array<int> &aCountBuf) { aCountBuf.Clean(); int *pCount; int wid; // class unigram LHashIter<int, int> hash_iter(&m_word_count); while (pCount = hash_iter.Next(wid)) { CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount); } // class bigram // add all the class bigram to buffer int wgram[10]; int keys[10]; for (keys[0] = 0; keys[0] < m_nClassNum; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; 
keys[1]++) { CountAdd(aCountBuf, m_class_gram, keys, 2, 0); } } for (keys[0] = 0; keys[0] < m_nVocabSize; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0); CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0); } } // add the count of bigram Trie<int, int> *pSub; TrieIter2<int, int> trie_iter(&m_wgram_count, wgram, 2); lout.Progress(0, true, m_nBigramNum-1, "Update Bigram:"); while (pSub = trie_iter.Next()) { if (!pSub->IsDataLegal()) continue; int count = *pSub->GetData(); keys[0] = m_mMap[wgram[0]]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_class_gram, keys, 2, count); keys[0] = m_mMap[wgram[0]]; keys[1] = wgram[1]; Reverse(keys); //将word储存在前,为了方便遍历某个word的有效的前继class CountAdd(aCountBuf, m_class_word_gram, keys, 2, count); keys[0] = wgram[0]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_word_class_gram, keys, 2, count); lout.Progress(); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> iterW(&m_word_count); while (pCount = iterW.Next(wid)) { lout_assert(*pCount > 0); m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } lout << "Total Class Count Buf = " << aCountBuf.GetNum() << endl; // allocate the buffer of each thread CopyCountToThreads(aCountBuf); } void WordCluster_t::CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Insert(key, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Insert(pKey, nLen, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Find(key, bFound); if (!pIdx) { 
lout_error("[CountAdd] no find the hash key=" << key); } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Find(pKey, nLen, bFound); if (!pIdx) { lout_error("[CountAdd] no find the trie key=" << pKey[0]); } aCountBuf[*pIdx] += count; } void WordCluster_t::CopyCountToThreads(Array<int> &aCountBuf) { int nThread = omp_get_max_threads(); // copy count m_tCountBuf.Reset(nThread, aCountBuf.GetNum()); for (int t = 0; t < nThread; t++) { memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.GetBuffer(), sizeof(aCountBuf[0])*aCountBuf.GetNum()); } // copy class map m_tMap.Reset(nThread, m_nVocabSize); for (int t = 0; t < nThread; t++) { memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(), sizeof(m_mMap[0])*m_mMap.GetNum()); } } void WordCluster_t::MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut /* = true */) { if (m_word_count.Find(nWord) == NULL) return; // no such word in the count int nClass = vMap[nWord]; int sig = (bOut) ? 
-1 : 1; int tid = omp_get_thread_num(); int *pCount; /// class unigram pCount = m_word_count.Find(nWord); CountAdd(vCountBuf, m_class, nClass, sig *(*pCount)); /// class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_class_word_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; pSub = m_word_class_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wgram_count.Find(w, 2); if (pCount) { CountAdd(vCountBuf, m_class_gram, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; //遍历nWord的前继 pSub = m_inv_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; pCount = p->GetData(); CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount)); } } //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list 
Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." << endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i < nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if 
(c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= 
m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
68  int w, n;
69  int *pCount;
71  while (pCount = iter.Next(w)) {
72  heap.Insert(w, *pCount);
73  }
74  int nClass = 0;
75  while (heap.OutTop(w, n)) { //前m_nClass-2个word,每个word一个类;后边的所有出现的class赋予m_nClassNum-2类;未出现的赋予m_nClass-1类
76  if (nClass <= m_nClassNum - 1)
77  m_aClass[w] = nClass;
78  else
79  m_aClass[w] = m_nClassNum - 1;
80  //m_aClass[w] = (nClass >= m_nClassNum-2)? m_nClassNum-2: nClass;
81  nClass++;
82  }
83  //WriteCount(m_wordGramCount, wbFile("test.count", "wt"));
84  //计算类别相关的count
85  UpdataCount();
86  }
87 
89  {
90  //wbFile test("a.dbg", "wt");
91  //清空count数据 m_classCount.Fill(0); //m_classGramCount.Fill(0); for (int i = 0; i < m_nClassNum + 1; i++) memset(m_pClassGramCount[i], 0, sizeof(int)*(m_nClassNum + 1)); m_classWordCount.Fill(0); m_wordClassCount.Fill(0); int w, n; int *pCount; //计算类别相关的count //Unigram class lout << "Update class Unigram" << endl; LHashIter<int, int> iter(&m_wordCount); int nTempTimes = 0; lout.Progress(0, true, m_nUnigramNum, "Update Unigram:"); while (pCount = iter.Next(w)) { CountAdd(m_classCount, m_aClass[w], *pCount); lout.Progress(++nTempTimes); } //Bigram class lout << "Update class Bigram" << endl; int gram[10]; int g[10]; Trie<int, int> *pSub; TrieIter2<int, int> iter2(&m_wordGramCount, gram, 2); nTempTimes = 0; lout.Progress(0, true, m_nBigramNum, "Update Bigram:"); while (pSub = iter2.Next()) { if (!pSub->IsDataLegal()) continue; //test.Print("%d %d\t%d\n", gram[0], gram[1], *(pSub->GetData())); g[0] = m_aClass[gram[0]]; g[1] = m_aClass[gram[1]]; CountAdd(m_pClassGramCount, g, 2, *(pSub->GetData())); g[0] = m_aClass[gram[0]]; g[1] = gram[1]; Reverse(g); //将word储存在前,为了方便遍历某个word的有效前级class CountAdd(m_classWordCount, g, 2, *(pSub->GetData())); g[0] = gram[0]; g[1] = m_aClass[gram[1]]; CountAdd(m_wordClassCount, g, 2, *(pSub->GetData())); lout.Progress(++nTempTimes); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> iterW(&m_wordCount); while (pCount = iterW.Next(w)) { lout_assert(*pCount >= 0); if (*pCount != 0) m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } } void WordCluster::WriteCount(LHash<int, int> &count, File &file) { LHashIter<int, int> iter(&count); int w; int *pCount; while (pCount = iter.Next(w)) { file.Print("%d\t%d\n", w, *pCount); } } void WordCluster::WriteCount(Trie<int, int> &count, File &file, bool bReverse /*=false*/) { int gram[10]; Trie<int, int> *pSub; TrieIter2<int, int> iter(&count, gram, 2); while (pSub = iter.Next()) { if (pSub->IsDataLegal()) { if (bReverse) file.Print("%d 
%d\t%d\n", gram[1], gram[0], *(pSub->GetData())); else file.Print("%d %d\t%d\n", gram[0], gram[1], *(pSub->GetData())); } } } void WordCluster::WriteRes_WordClass(const char *path) { File file(path, "wt"); for (int i = 0; i < m_aClass.GetNum(); i++) { file.Print("%d\t%d\n", i, m_aClass[i]); } } void WordCluster::WriteRes_ClassWord(const char *path) { Array<Array<int>*> aClass(m_nClassNum); for (int i = 0; i < m_nClassNum + 1; i++) aClass[i] = new Array<int>(); int w; for (w = 0; w < m_nVocabSize; w++) { aClass[m_aClass[w]]->Add(w); } File file(path, "wt"); for (int i = 0; i < m_nClassNum; i++) { file.Print("[%d]\t", i); for (int n = 0; n < aClass[i]->GetNum(); n++) { int w = aClass[i]->Get(n); int *pcount = m_wordCount.Find(w); int ncount = (pcount == NULL) ? 0 : *pcount; file.Print("%d{%d} ", aClass[i]->Get(n), ncount); //打印出现count } file.Print("\n"); } for (int i = 0; i < m_nClassNum; i++) SAFE_DELETE(aClass[i]); } void WordCluster::WriteRes_TagVocab(const char *path) { File file(path, "wt"); for (int i = 0; i < m_aClass.GetNum(); i++) { file.Print("%d\t%d %d\n", i, i, m_aClass[i]); } } void WordCluster::Read_TagVocab(const char *path) { File file(path, "rt"); for (int i = 0; i < m_aClass.GetNum(); i++) { int g, w, c; fscanf(file, "%d\t%d %d\n", &g, &w, &c); if (g != i) { lout_error("read TagVocab ERROR!!"); } m_aClass[g] = c; } UpdataCount(); } double WordCluster::LogLikelihood() { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; // Sum{ N(g_v,g_w)logN(g_v,g_w) } // int g[10]; // wbTrie<int,int> *pSub; // wbTrieIter2<int,int> iter2(m_classGramCount, g, 2); // while (pSub = iter2.Next()) { // if (!pSub->IsDataLegal()) // continue; // // int n = *(pSub->GetData()); // lout_assert( n>= 0 ); // if ( n != 0 ) { // dSumClassGram += 1.0 * n/m_nSentNum * log((double)n); // } // } for (int i = 0; i < m_nClassNum + 1; i++) { for (int j = 0; j < m_nClassNum + 1; j++) { int n = m_pClassGramCount[i][j]; if (n < 0) { lout_error("classGramCount (" << n << 
") < 0") } if (n != 0) { dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); } } } //Sum { N(g)logN(g) } int c, w; int *pCount; LHashIter<int, int> iterC(&m_classCount); while (pCount = iterC.Next(c)) { lout_assert(*pCount >= 0); if (*pCount != 0) dSumClass += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } //Sum { N(w)logN(w) } // wbLHashIter<int,int> iterW(&m_wordCount); // while (pCount = iterW.Next(w)) { // lout_assert(*pCount>=0); // if ( *pCount != 0 ) // dSumWord += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); // } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster::MoveWord(int nWord, bool bOut /* = true */) { int nClass = m_aClass[nWord]; int sig = (bOut) ? -1 : 1; // class unigram int *pCount = m_wordCount.Find(nWord); if (pCount == NULL) return; CountAdd(m_classCount, nClass, sig*(*pCount)); // class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_classWordCount.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { CountAdd(m_pClassGramCount, g, 2, sig*(*p->GetData())); } } g[0] = nClass; pSub = m_wordClassCount.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { CountAdd(m_pClassGramCount, g, 2, sig*(*p->GetData())); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wordGramCount.Find(w, 2); if (pCount) { CountAdd(m_pClassGramCount, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; // int a1=0, a2=0; // int b1=0, b2=0; // wbLHashIter<int,int> vocabIter(&m_wordCount); // while (vocabIter.Next(v)) { //遍历所有的词v // g[0] = v; // // w[0] = v; // w[1] = nWord; // pCount = m_wordGramCount.Find(w, 2); // if (pCount) { // a1 ++; // CountAdd(m_wordClassCount, g, 2, sig*(*pCount)); // } // // w[0] = nWord; // w[1] = v; // pCount = m_wordGramCount.Find(w, 2); // if (pCount) { // b1 ++; // 
CountAdd(m_classWordCount, g, 2, sig*(*pCount)); // } // } //遍历nWord的前继 pSub = m_invWordGram.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; w[0] = v; w[1] = nWord; // pCount = m_wordGramCount.Find(w, 2); // if ( *pCount != *(p->GetData()) ) { // lout_error("ERROR_test"); // } pCount = p->GetData(); if (pCount) { //a2++; CountAdd(m_wordClassCount, g, 2, sig*(*pCount)); } } } //遍历nWord后继 pSub = m_wordGramCount.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; pCount = p->GetData(); if (pCount) { //b2++; CountAdd(m_classWordCount, g, 2, sig*(*pCount)); } } } // lout_variable(a1); // lout_variable(a2); // lout_variable(b1); // lout_variable(b2); // Pause(); } void WordCluster::ExchangeWord(int nWord, int nToClass) { if (nToClass == m_aClass[nWord]) return; MoveWord(nWord, true); // move out from nClass m_aClass[nWord] = nToClass; MoveWord(nWord, false); // move into nToClass // m_aClass[nWord] = nToClass; // UpdataCount(); } void WordCluster::Cluster(int nMaxTime /* = -1 */) { bool bChange = false; int nTimes = 0; lout_variable(m_nVocabSize); int nTotalSwitchClassNum = m_nClassNum; //将未出现的word分到同一类 for (int w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) { m_aClass[w] = m_nClassNum - 1; ///< 赋予最后一个类 nTotalSwitchClassNum = m_nClassNum - 1; } } while (1) //外层循环 { bChange = false; int w; // wbLHashIter<int,int> vocabIter(&m_wordCount); // while (vocabIter.Next(w)) //遍历每个word for (w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) continue; int nOldClass = m_aClass[w]; //保存目前的class int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < nTotalSwitchClassNum; c++) //转移到每个class { ExchangeWord(w, c); double dCurValue = LogLikelihood(); //替换后的负对数似然值 // lout << w << " to class_" << c << " ll=" << dCurValue << endl; // Pause(); if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = 
c; } } #ifdef _DEBUG //lout<<"class_"<<nOldClass<<" -> class_"<<nOptClass<<endl; #endif ExchangeWord(w, nOptClass); if (nOptClass != nOldClass) {//改变了类 lout << "[exchange_" << nTimes + 1 << "] " << w << " class_" << nOldClass << " -> class_" << nOptClass << " value=" << dOptValue << endl; bChange = true; if (nOptClass >= nTotalSwitchClassNum) { lout_error("[cluster] 未定义的to class-id (" << nOptClass << ") for word (" << w << ") "); } /*Pause();*/ } } lout_variable(LogLikelihood()); //统计下每个class中的词数目 Array<int> aNum(m_nClassNum); aNum.Fill(0); // vocabIter.Init(); // while (vocabIter.Next(w)) { for (w = 0; w < m_nVocabSize; w++) { if (m_aClass[w] >= m_nClassNum) { lout_error("[cluster] 未定义的class-id (" << m_aClass[w] << ") for word (" << w << ") "); } aNum[m_aClass[w]] ++; } for (int i = 0; i < m_nClassNum; i++) lout << i << "[" << aNum[i] << "] "; lout << endl; //打印当前的结果 if (m_pathWordClass) WriteRes_WordClass(m_pathWordClass); if (m_pathClassWord) WriteRes_ClassWord(m_pathClassWord); if (m_pathTagVocab) WriteRes_TagVocab(m_pathTagVocab); nTimes++; if (nTimes == nMaxTime) { lout << "[end] Get Max Times" << endl; break; } if (!bChange) { lout << "[end] No Change" << endl; break; } } } void WordCluster::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_wordCount.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_aClass[w] = m_nClassNum - 1; else m_aClass[w] = n; } } void WordCluster_t::WriteRes(const char *path) { lout << "Write to " << path << endl; File f(path, "wt"); for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { f.Print("%d\t%d\n", w, m_mMap[w]); } } } void WordCluster_t::ReadRes(const char *path) { lout << "Read from" << path << endl; File f(path, 
"rt"); char *pLine; while (pLine = f.GetLine()) { int wid = atoi(strtok(pLine, " \t\n")); int cid = atoi(strtok(NULL, " \t\n")); m_mMap[wid] = cid; } } void WordCluster_t::InitCount(const char *path, const char *path_init_res /* = NULL */) { m_nSentNum = 0; m_nVocabSize = 0; m_nUnigramNum = 0; m_nBigramNum = 0; bool bFound; File file(path, "rt"); char *pLine = NULL; while (pLine = file.GetLine(true)) { Array<int> aWords; char *pWord = strtok(pLine, " \t"); while (pWord) { aWords.Add(atoi(pWord)); pWord = strtok(NULL, " \t"); } m_nSentNum += 1; //Unigram for (int i = 0; i < aWords.GetNum(); i++) { m_nVocabSize = max(m_nVocabSize, aWords[i] + 1); int *pCount = m_word_count.Insert(aWords[i], bFound); if (!bFound) { *pCount = 0; m_nUnigramNum++; } *pCount += 1; } //Bigram for (int i = 0; i < aWords.GetNum() - 1; i++) { int key[3]; key[0] = aWords[i]; key[1] = aWords[i + 1]; int *pCount = m_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; m_nBigramNum++; } *pCount += 1; Reverse(key); pCount = m_inv_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; } *pCount += 1; } } lout_variable(m_word_count.GetNum()); lout_variable(m_nVocabSize); lout_variable(m_nClassNum); lout_variable(m_nUnigramNum); lout_variable(m_nBigramNum); if (m_word_count.GetNum() < m_nClassNum) { lout << "The word_num(" << m_word_count.GetNum() << ") < class_num(" << m_nClassNum << ")" << endl; lout << "no need to cluster!!" 
<< endl; exit(1); } m_mMap.SetNum(m_nVocabSize); m_mMap.Fill(m_nVocabSize-1); if (path_init_res) { lout << "Init the class from file: " << path_init_res << endl; ReadRes(path_init_res); } else { lout << "Init the class based unigram count" << endl; Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *pCount = m_word_count.Find(w); if (pCount) { heap.Insert(w, *pCount); } else { heap.Insert(w, 0); // zero count } } int w, n; int nClass = 0; while (heap.OutTop(w, n)) { m_mMap[w] = min(nClass, m_nClassNum - 1); nClass++; } } WriteRes(m_pathRes); UpdateCount(m_mCountBuf); // lout_variable(m_dWordLogSum); // lout_variable(LogLikelihood(VecShell<int>(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()))); // cluster.InitCount(path); // lout_variable(cluster.m_dWordLogSum); // lout_variable(cluster.LogLikelihood()); } void WordCluster_t::UpdateCount(Array<int> &aCountBuf) { aCountBuf.Clean(); int *pCount; int wid; // class unigram LHashIter<int, int> hash_iter(&m_word_count); while (pCount = hash_iter.Next(wid)) { CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount); } // class bigram // add all the class bigram to buffer int wgram[10]; int keys[10]; for (keys[0] = 0; keys[0] < m_nClassNum; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_class_gram, keys, 2, 0); } } for (keys[0] = 0; keys[0] < m_nVocabSize; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0); CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0); } } // add the count of bigram Trie<int, int> *pSub; TrieIter2<int, int> trie_iter(&m_wgram_count, wgram, 2); lout.Progress(0, true, m_nBigramNum-1, "Update Bigram:"); while (pSub = trie_iter.Next()) { if (!pSub->IsDataLegal()) continue; int count = *pSub->GetData(); keys[0] = m_mMap[wgram[0]]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_class_gram, keys, 2, count); keys[0] = m_mMap[wgram[0]]; keys[1] = wgram[1]; Reverse(keys); 
//将word储存在前,为了方便遍历某个word的有效的前继class CountAdd(aCountBuf, m_class_word_gram, keys, 2, count); keys[0] = wgram[0]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_word_class_gram, keys, 2, count); lout.Progress(); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> iterW(&m_word_count); while (pCount = iterW.Next(wid)) { lout_assert(*pCount > 0); m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } lout << "Total Class Count Buf = " << aCountBuf.GetNum() << endl; // allocate the buffer of each thread CopyCountToThreads(aCountBuf); } void WordCluster_t::CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Insert(key, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Insert(pKey, nLen, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Find(key, bFound); if (!pIdx) { lout_error("[CountAdd] no find the hash key=" << key); } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Find(pKey, nLen, bFound); if (!pIdx) { lout_error("[CountAdd] no find the trie key=" << pKey[0]); } aCountBuf[*pIdx] += count; } void WordCluster_t::CopyCountToThreads(Array<int> &aCountBuf) { int nThread = omp_get_max_threads(); // copy count m_tCountBuf.Reset(nThread, aCountBuf.GetNum()); for (int t = 0; t < nThread; t++) { memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.GetBuffer(), sizeof(aCountBuf[0])*aCountBuf.GetNum()); } // copy class map m_tMap.Reset(nThread, m_nVocabSize); for (int t = 0; t < 
nThread; t++) { memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(), sizeof(m_mMap[0])*m_mMap.GetNum()); } } void WordCluster_t::MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut /* = true */) { if (m_word_count.Find(nWord) == NULL) return; // no such word in the count int nClass = vMap[nWord]; int sig = (bOut) ? -1 : 1; int tid = omp_get_thread_num(); int *pCount; /// class unigram pCount = m_word_count.Find(nWord); CountAdd(vCountBuf, m_class, nClass, sig *(*pCount)); /// class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_class_word_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; pSub = m_word_class_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wgram_count.Find(w, 2); if (pCount) { CountAdd(vCountBuf, m_class_gram, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; //遍历nWord的前继 pSub = m_inv_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; pCount = p->GetData(); CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount)); } } //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; 
MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." << endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i 
< nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> 
heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
// --- tail of WordCluster::UpdataCount() (function header is above this chunk) ---
// Rebuilds every class-level count table from the word-level counts:
//   m_classCount[c]          = Sum_{w in c} N(w)
//   m_pClassGramCount[c1][c2]= Sum N(w1,w2) over bigrams with class(w1)=c1, class(w2)=c2
//   m_classWordCount / m_wordClassCount = mixed (word,class) bigram counts
// then recomputes m_dWordLogSum = Sum_w N(w)/S * log N(w)  (S = m_nSentNum).
// NOTE(review): the extraction lost listing lines 96-97, 104 and 142, which held
// the declarations of `iter` and `iterW` — per the duplicated rendering in the
// file header they are LHashIter<int,int> over m_wordCount; confirm against VCS.
92  m_classCount.Fill(0);
93  //m_classGramCount.Fill(0);
94  for (int i = 0; i < m_nClassNum + 1; i++)
95  memset(m_pClassGramCount[i], 0, sizeof(int)*(m_nClassNum + 1));
98 
99  int w, n;
100  int *pCount;
101  // compute the class-related counts
102  //Unigram class
103  lout << "Update class Unigram" << endl;
105  int nTempTimes = 0;
106  lout.Progress(0, true, m_nUnigramNum, "Update Unigram:");
107  while (pCount = iter.Next(w)) {
108  CountAdd(m_classCount, m_aClass[w], *pCount);
109  lout.Progress(++nTempTimes);
110  }
111  //Bigram class
112  lout << "Update class Bigram" << endl;
113  int gram[10];
114  int g[10];
115  Trie<int, int> *pSub;
116  TrieIter2<int, int> iter2(&m_wordGramCount, gram, 2);
117  nTempTimes = 0;
118  lout.Progress(0, true, m_nBigramNum, "Update Bigram:");
119  while (pSub = iter2.Next()) {
120  if (!pSub->IsDataLegal())
121  continue;
122 
123  //test.Print("%d %d\t%d\n", gram[0], gram[1], *(pSub->GetData()));
124 
125  g[0] = m_aClass[gram[0]];
126  g[1] = m_aClass[gram[1]];
127  CountAdd(m_pClassGramCount, g, 2, *(pSub->GetData()));
128  g[0] = m_aClass[gram[0]];
129  g[1] = gram[1];
130  Reverse(g); // store the word first, so a word's valid predecessor classes can be enumerated cheaply
131  CountAdd(m_classWordCount, g, 2, *(pSub->GetData()));
132  g[0] = gram[0];
133  g[1] = m_aClass[gram[1]];
134  CountAdd(m_wordClassCount, g, 2, *(pSub->GetData()));
135  lout.Progress(++nTempTimes);
136  }
137 
138 
139  //Sum { N(w)logN(w) }
140  lout << "Prepare Sum" << endl;
141  m_dWordLogSum = 0;
143  while (pCount = iterW.Next(w)) {
144  lout_assert(*pCount >= 0);
145  if (*pCount != 0)
146  m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount));
147  }
148  }
// --- body of WordCluster::WriteCount(LHash<int,int> &count, File &file) ---
// NOTE(review): the signature line (listing line 149) was lost in extraction;
// the parameter names `count` and `file` below imply the signature above —
// confirm against the header wb-word-cluster.h.
// Dumps every (key, count) pair of the hash as "key<TAB>count", one per line.
150  {
151  LHashIter<int, int> iter(&count);
152  int w;
153  int *pCount;
154  while (pCount = iter.Next(w)) {
155  file.Print("%d\t%d\n", w, *pCount);
156  }
157  }
158  void WordCluster::WriteCount(Trie<int, int> &count, File &file, bool bReverse /*=false*/)
159  {
160  int gram[10];
161  Trie<int, int> *pSub;
162  TrieIter2<int, int> iter(&count, gram, 2);
163  while (pSub = iter.Next()) {
164  if (pSub->IsDataLegal()) {
165  if (bReverse)
166  file.Print("%d %d\t%d\n", gram[1], gram[0], *(pSub->GetData()));
167  else
168  file.Print("%d %d\t%d\n", gram[0], gram[1], *(pSub->GetData()));
169  }
170  }
171  }
172  void WordCluster::WriteRes_WordClass(const char *path)
173  {
174  File file(path, "wt");
175  for (int i = 0; i < m_aClass.GetNum(); i++) {
176  file.Print("%d\t%d\n", i, m_aClass[i]);
177  }
178  }
179  void WordCluster::WriteRes_ClassWord(const char *path)
180  {
181 
182  Array<Array<int>*> aClass(m_nClassNum);
183  for (int i = 0; i < m_nClassNum + 1; i++)
184  aClass[i] = new Array<int>();
185 
186  int w;
187  for (w = 0; w < m_nVocabSize; w++)
188  {
189  aClass[m_aClass[w]]->Add(w);
190  }
191 
192  File file(path, "wt");
193  for (int i = 0; i < m_nClassNum; i++) {
194  file.Print("[%d]\t", i);
195  for (int n = 0; n < aClass[i]->GetNum(); n++) {
196 
197  int w = aClass[i]->Get(n);
198  int *pcount = m_wordCount.Find(w);
199  int ncount = (pcount == NULL) ? 0 : *pcount;
200 
201  file.Print("%d{%d} ", aClass[i]->Get(n), ncount); //打印出现count
202  }
203  file.Print("\n");
204  }
205 
206 
207  for (int i = 0; i < m_nClassNum; i++)
208  SAFE_DELETE(aClass[i]);
209  }
210  void WordCluster::WriteRes_TagVocab(const char *path)
211  {
212  File file(path, "wt");
213  for (int i = 0; i < m_aClass.GetNum(); i++) {
214  file.Print("%d\t%d %d\n", i, i, m_aClass[i]);
215  }
216  }
217  void WordCluster::Read_TagVocab(const char *path)
218  {
219  File file(path, "rt");
220  for (int i = 0; i < m_aClass.GetNum(); i++) {
221  int g, w, c;
222  fscanf(file, "%d\t%d %d\n", &g, &w, &c);
223  if (g != i) {
224  lout_error("read TagVocab ERROR!!");
225  }
226  m_aClass[g] = c;
227  }
228  UpdataCount();
229  }
// --- body of WordCluster::LogLikelihood() ---
// NOTE(review): the signature line (listing line 230) and the declaration of
// `iterC` (listing line 266) were lost in extraction; the WordCluster_t
// version later in this file iterates the class unigram table at the same
// spot, so iterC is presumably an LHashIter<int,int> over m_classCount —
// confirm against VCS.
// Computes (up to constants) the class-bigram log-likelihood of the corpus:
//   Sum N(c1,c2)/S*logN(c1,c2) - 2*Sum N(c)/S*logN(c) + m_dWordLogSum
// with S = m_nSentNum (see the return expression at the bottom).
231  {
232  double dSumClassGram = 0;
233  double dSumClass = 0;
234  double dSumWord = 0;
235  // Sum{ N(g_v,g_w)logN(g_v,g_w) }
236  // int g[10];
237  // wbTrie<int,int> *pSub;
238  // wbTrieIter2<int,int> iter2(m_classGramCount, g, 2);
239  // while (pSub = iter2.Next()) {
240  // if (!pSub->IsDataLegal())
241  // continue;
242  //
243  // int n = *(pSub->GetData());
244  // lout_assert( n>= 0 );
245  // if ( n != 0 ) {
246  // dSumClassGram += 1.0 * n/m_nSentNum * log((double)n);
247  // }
248  // }
249  for (int i = 0; i < m_nClassNum + 1; i++) {
250  for (int j = 0; j < m_nClassNum + 1; j++) {
251  int n = m_pClassGramCount[i][j];
252  if (n < 0) {
253  lout_error("classGramCount (" << n << ") < 0")
254  }
255  if (n != 0) {
256  dSumClassGram += 1.0 * n / m_nSentNum * log((double)n);
257  }
258  }
259  }
260 
261 
262 
263  //Sum { N(g)logN(g) }
264  int c, w;
265  int *pCount;
267  while (pCount = iterC.Next(c)) {
268  lout_assert(*pCount >= 0);
269  if (*pCount != 0)
270  dSumClass += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount));
271  }
272 
273  //Sum { N(w)logN(w) }
274  // wbLHashIter<int,int> iterW(&m_wordCount);
275  // while (pCount = iterW.Next(w)) {
276  // lout_assert(*pCount>=0);
277  // if ( *pCount != 0 )
278  // dSumWord += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount));
279  // }
280 
281  double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum;
282  return dRes;
283  }
// --- head of WordCluster::MoveWord (the function continues past this chunk;
// its tail — the predecessor/successor word-class updates — appears only in
// the collapsed duplicate rendering below) ---
// Incrementally updates all class-level count tables when word nWord is
// moved OUT of (bOut=true, sig=-1) or INTO (bOut=false, sig=+1) its current
// class m_aClass[nWord]. Words with no unigram count are ignored.
// WARNING(review): the update order and the sig-less self-bigram correction
// below are order-sensitive; do not reorder statements without re-deriving
// the count algebra.
284  void WordCluster::MoveWord(int nWord, bool bOut /* = true */)
285  {
286  int nClass = m_aClass[nWord];
287  int sig = (bOut) ? -1 : 1;
288 
289  // class unigram
290  int *pCount = m_wordCount.Find(nWord);
291  if (pCount == NULL)
292  return;
293  CountAdd(m_classCount, nClass, sig*(*pCount));
294 
295 
296  // class bigram
297  int g[10];
298  int w[10];
299 
300  g[1] = nClass;
301  Trie<int, int> *pSub = m_classWordCount.FindTrie(&nWord, 1);
302  if (pSub) { // iterate all possible predecessor classes of nWord
303  TrieIter<int, int> iter(pSub);
304  Trie<int, int> *p;
305  while (p = iter.Next(g[0])) {
306  CountAdd(m_pClassGramCount, g, 2, sig*(*p->GetData()));
307  }
308  }
309 
310  g[0] = nClass;
311  pSub = m_wordClassCount.FindTrie(&nWord, 1);
312  if (pSub) { // iterate all possible successor classes of nWord
313  TrieIter<int, int> iter(pSub);
314  Trie<int, int> *p;
315  while (p = iter.Next(g[1])) {
316  CountAdd(m_pClassGramCount, g, 2, sig*(*p->GetData()));
317  }
318  }
319 
320  g[0] = nClass;
321  g[1] = nClass;
322  w[0] = nWord;
323  w[1] = nWord;
324  pCount = m_wordGramCount.Find(w, 2);
325  if (pCount) {
326  CountAdd(m_pClassGramCount, g, 2, *pCount); // NOTE(review): added WITHOUT sig — presumably a correction for the self-bigram (nWord,nWord) being touched by both loops above; confirm before changing
327  }
328 
329  // word class pair
330 
331  int v;
332  g[1] = nClass;
333 
334  // int a1=0, a2=0;
335  // int b1=0, b2=0;
336  // wbLHashIter<int,int> vocabIter(&m_wordCount);
337  // while (vocabIter.Next(v)) { // iterate every word v
338  // g[0] = v;
339  //
340  // w[0] = v;
341  // w[1] = nWord;
342  // pCount = m_wordGramCount.Find(w, 2);
343  // if (pCount) {
344  // a1 ++;
345  // CountAdd(m_wordClassCount, g, 2, sig*(*pCount));
346  // }
347  //
348  // w[0] = nWord;
349  // w[1] = v;
350  // pCount = m_wordGramCount.Find(w, 2);
351  // if (pCount) {
352  // b1 ++;
353  // CountAdd(m_classWordCount, g, 2, sig*(*pCount));
354  // }
355  // }
356 
357  //遍历nWord的前继 pSub = m_invWordGram.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; w[0] = v; w[1] = nWord; // pCount = m_wordGramCount.Find(w, 2); // if ( *pCount != *(p->GetData()) ) { // lout_error("ERROR_test"); // } pCount = p->GetData(); if (pCount) { //a2++; CountAdd(m_wordClassCount, g, 2, sig*(*pCount)); } } } //遍历nWord后继 pSub = m_wordGramCount.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; pCount = p->GetData(); if (pCount) { //b2++; CountAdd(m_classWordCount, g, 2, sig*(*pCount)); } } } // lout_variable(a1); // lout_variable(a2); // lout_variable(b1); // lout_variable(b2); // Pause(); } void WordCluster::ExchangeWord(int nWord, int nToClass) { if (nToClass == m_aClass[nWord]) return; MoveWord(nWord, true); // move out from nClass m_aClass[nWord] = nToClass; MoveWord(nWord, false); // move into nToClass // m_aClass[nWord] = nToClass; // UpdataCount(); } void WordCluster::Cluster(int nMaxTime /* = -1 */) { bool bChange = false; int nTimes = 0; lout_variable(m_nVocabSize); int nTotalSwitchClassNum = m_nClassNum; //将未出现的word分到同一类 for (int w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) { m_aClass[w] = m_nClassNum - 1; ///< 赋予最后一个类 nTotalSwitchClassNum = m_nClassNum - 1; } } while (1) //外层循环 { bChange = false; int w; // wbLHashIter<int,int> vocabIter(&m_wordCount); // while (vocabIter.Next(w)) //遍历每个word for (w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) continue; int nOldClass = m_aClass[w]; //保存目前的class int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < nTotalSwitchClassNum; c++) //转移到每个class { ExchangeWord(w, c); double dCurValue = LogLikelihood(); //替换后的负对数似然值 // lout << w << " to class_" << c << " ll=" << dCurValue << endl; // Pause(); if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } #ifdef _DEBUG //lout<<"class_"<<nOldClass<<" 
-> class_"<<nOptClass<<endl; #endif ExchangeWord(w, nOptClass); if (nOptClass != nOldClass) {//改变了类 lout << "[exchange_" << nTimes + 1 << "] " << w << " class_" << nOldClass << " -> class_" << nOptClass << " value=" << dOptValue << endl; bChange = true; if (nOptClass >= nTotalSwitchClassNum) { lout_error("[cluster] 未定义的to class-id (" << nOptClass << ") for word (" << w << ") "); } /*Pause();*/ } } lout_variable(LogLikelihood()); //统计下每个class中的词数目 Array<int> aNum(m_nClassNum); aNum.Fill(0); // vocabIter.Init(); // while (vocabIter.Next(w)) { for (w = 0; w < m_nVocabSize; w++) { if (m_aClass[w] >= m_nClassNum) { lout_error("[cluster] 未定义的class-id (" << m_aClass[w] << ") for word (" << w << ") "); } aNum[m_aClass[w]] ++; } for (int i = 0; i < m_nClassNum; i++) lout << i << "[" << aNum[i] << "] "; lout << endl; //打印当前的结果 if (m_pathWordClass) WriteRes_WordClass(m_pathWordClass); if (m_pathClassWord) WriteRes_ClassWord(m_pathClassWord); if (m_pathTagVocab) WriteRes_TagVocab(m_pathTagVocab); nTimes++; if (nTimes == nMaxTime) { lout << "[end] Get Max Times" << endl; break; } if (!bChange) { lout << "[end] No Change" << endl; break; } } } void WordCluster::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_wordCount.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_aClass[w] = m_nClassNum - 1; else m_aClass[w] = n; } } void WordCluster_t::WriteRes(const char *path) { lout << "Write to " << path << endl; File f(path, "wt"); for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { f.Print("%d\t%d\n", w, m_mMap[w]); } } } void WordCluster_t::ReadRes(const char *path) { lout << "Read from" << path << endl; File f(path, "rt"); char *pLine; while (pLine = f.GetLine()) { 
int wid = atoi(strtok(pLine, " \t\n")); int cid = atoi(strtok(NULL, " \t\n")); m_mMap[wid] = cid; } } void WordCluster_t::InitCount(const char *path, const char *path_init_res /* = NULL */) { m_nSentNum = 0; m_nVocabSize = 0; m_nUnigramNum = 0; m_nBigramNum = 0; bool bFound; File file(path, "rt"); char *pLine = NULL; while (pLine = file.GetLine(true)) { Array<int> aWords; char *pWord = strtok(pLine, " \t"); while (pWord) { aWords.Add(atoi(pWord)); pWord = strtok(NULL, " \t"); } m_nSentNum += 1; //Unigram for (int i = 0; i < aWords.GetNum(); i++) { m_nVocabSize = max(m_nVocabSize, aWords[i] + 1); int *pCount = m_word_count.Insert(aWords[i], bFound); if (!bFound) { *pCount = 0; m_nUnigramNum++; } *pCount += 1; } //Bigram for (int i = 0; i < aWords.GetNum() - 1; i++) { int key[3]; key[0] = aWords[i]; key[1] = aWords[i + 1]; int *pCount = m_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; m_nBigramNum++; } *pCount += 1; Reverse(key); pCount = m_inv_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; } *pCount += 1; } } lout_variable(m_word_count.GetNum()); lout_variable(m_nVocabSize); lout_variable(m_nClassNum); lout_variable(m_nUnigramNum); lout_variable(m_nBigramNum); if (m_word_count.GetNum() < m_nClassNum) { lout << "The word_num(" << m_word_count.GetNum() << ") < class_num(" << m_nClassNum << ")" << endl; lout << "no need to cluster!!" 
<< endl; exit(1); } m_mMap.SetNum(m_nVocabSize); m_mMap.Fill(m_nVocabSize-1); if (path_init_res) { lout << "Init the class from file: " << path_init_res << endl; ReadRes(path_init_res); } else { lout << "Init the class based unigram count" << endl; Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *pCount = m_word_count.Find(w); if (pCount) { heap.Insert(w, *pCount); } else { heap.Insert(w, 0); // zero count } } int w, n; int nClass = 0; while (heap.OutTop(w, n)) { m_mMap[w] = min(nClass, m_nClassNum - 1); nClass++; } } WriteRes(m_pathRes); UpdateCount(m_mCountBuf); // lout_variable(m_dWordLogSum); // lout_variable(LogLikelihood(VecShell<int>(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()))); // cluster.InitCount(path); // lout_variable(cluster.m_dWordLogSum); // lout_variable(cluster.LogLikelihood()); } void WordCluster_t::UpdateCount(Array<int> &aCountBuf) { aCountBuf.Clean(); int *pCount; int wid; // class unigram LHashIter<int, int> hash_iter(&m_word_count); while (pCount = hash_iter.Next(wid)) { CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount); } // class bigram // add all the class bigram to buffer int wgram[10]; int keys[10]; for (keys[0] = 0; keys[0] < m_nClassNum; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_class_gram, keys, 2, 0); } } for (keys[0] = 0; keys[0] < m_nVocabSize; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0); CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0); } } // add the count of bigram Trie<int, int> *pSub; TrieIter2<int, int> trie_iter(&m_wgram_count, wgram, 2); lout.Progress(0, true, m_nBigramNum-1, "Update Bigram:"); while (pSub = trie_iter.Next()) { if (!pSub->IsDataLegal()) continue; int count = *pSub->GetData(); keys[0] = m_mMap[wgram[0]]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_class_gram, keys, 2, count); keys[0] = m_mMap[wgram[0]]; keys[1] = wgram[1]; Reverse(keys); 
//将word储存在前,为了方便遍历某个word的有效的前继class CountAdd(aCountBuf, m_class_word_gram, keys, 2, count); keys[0] = wgram[0]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_word_class_gram, keys, 2, count); lout.Progress(); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> iterW(&m_word_count); while (pCount = iterW.Next(wid)) { lout_assert(*pCount > 0); m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } lout << "Total Class Count Buf = " << aCountBuf.GetNum() << endl; // allocate the buffer of each thread CopyCountToThreads(aCountBuf); } void WordCluster_t::CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Insert(key, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Insert(pKey, nLen, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Find(key, bFound); if (!pIdx) { lout_error("[CountAdd] no find the hash key=" << key); } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Find(pKey, nLen, bFound); if (!pIdx) { lout_error("[CountAdd] no find the trie key=" << pKey[0]); } aCountBuf[*pIdx] += count; } void WordCluster_t::CopyCountToThreads(Array<int> &aCountBuf) { int nThread = omp_get_max_threads(); // copy count m_tCountBuf.Reset(nThread, aCountBuf.GetNum()); for (int t = 0; t < nThread; t++) { memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.GetBuffer(), sizeof(aCountBuf[0])*aCountBuf.GetNum()); } // copy class map m_tMap.Reset(nThread, m_nVocabSize); for (int t = 0; t < 
nThread; t++) { memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(), sizeof(m_mMap[0])*m_mMap.GetNum()); } } void WordCluster_t::MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut /* = true */) { if (m_word_count.Find(nWord) == NULL) return; // no such word in the count int nClass = vMap[nWord]; int sig = (bOut) ? -1 : 1; int tid = omp_get_thread_num(); int *pCount; /// class unigram pCount = m_word_count.Find(nWord); CountAdd(vCountBuf, m_class, nClass, sig *(*pCount)); /// class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_class_word_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; pSub = m_word_class_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wgram_count.Find(w, 2); if (pCount) { CountAdd(vCountBuf, m_class_gram, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; //遍历nWord的前继 pSub = m_inv_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; pCount = p->GetData(); CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount)); } } //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; 
MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." << endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i 
< nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> 
heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
358  pSub = m_invWordGram.FindTrie(&nWord, 1);
359  if (pSub) {
360  TrieIter<int, int> iter(pSub);
361  Trie<int, int> *p;
362  while (p = iter.Next(v)) {
363  g[0] = v;
364  w[0] = v;
365  w[1] = nWord;
366  // pCount = m_wordGramCount.Find(w, 2);
367  // if ( *pCount != *(p->GetData()) ) {
368  // lout_error("ERROR_test");
369  // }
370  pCount = p->GetData();
371  if (pCount) {
372  //a2++;
373  CountAdd(m_wordClassCount, g, 2, sig*(*pCount));
374  }
375  }
376  }
377  //遍历nWord后继 pSub = m_wordGramCount.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; pCount = p->GetData(); if (pCount) { //b2++; CountAdd(m_classWordCount, g, 2, sig*(*pCount)); } } } // lout_variable(a1); // lout_variable(a2); // lout_variable(b1); // lout_variable(b2); // Pause(); } void WordCluster::ExchangeWord(int nWord, int nToClass) { if (nToClass == m_aClass[nWord]) return; MoveWord(nWord, true); // move out from nClass m_aClass[nWord] = nToClass; MoveWord(nWord, false); // move into nToClass // m_aClass[nWord] = nToClass; // UpdataCount(); } void WordCluster::Cluster(int nMaxTime /* = -1 */) { bool bChange = false; int nTimes = 0; lout_variable(m_nVocabSize); int nTotalSwitchClassNum = m_nClassNum; //将未出现的word分到同一类 for (int w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) { m_aClass[w] = m_nClassNum - 1; ///< 赋予最后一个类 nTotalSwitchClassNum = m_nClassNum - 1; } } while (1) //外层循环 { bChange = false; int w; // wbLHashIter<int,int> vocabIter(&m_wordCount); // while (vocabIter.Next(w)) //遍历每个word for (w = 0; w < m_nVocabSize; w++) { if (m_wordCount.Find(w) == NULL) continue; int nOldClass = m_aClass[w]; //保存目前的class int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < nTotalSwitchClassNum; c++) //转移到每个class { ExchangeWord(w, c); double dCurValue = LogLikelihood(); //替换后的负对数似然值 // lout << w << " to class_" << c << " ll=" << dCurValue << endl; // Pause(); if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } #ifdef _DEBUG //lout<<"class_"<<nOldClass<<" -> class_"<<nOptClass<<endl; #endif ExchangeWord(w, nOptClass); if (nOptClass != nOldClass) {//改变了类 lout << "[exchange_" << nTimes + 1 << "] " << w << " class_" << nOldClass << " -> class_" << nOptClass << " value=" << dOptValue << endl; bChange = true; if (nOptClass >= nTotalSwitchClassNum) { lout_error("[cluster] 未定义的to class-id (" << nOptClass << ") for word (" << w << ") "); } 
/*Pause();*/ } } lout_variable(LogLikelihood()); //统计下每个class中的词数目 Array<int> aNum(m_nClassNum); aNum.Fill(0); // vocabIter.Init(); // while (vocabIter.Next(w)) { for (w = 0; w < m_nVocabSize; w++) { if (m_aClass[w] >= m_nClassNum) { lout_error("[cluster] 未定义的class-id (" << m_aClass[w] << ") for word (" << w << ") "); } aNum[m_aClass[w]] ++; } for (int i = 0; i < m_nClassNum; i++) lout << i << "[" << aNum[i] << "] "; lout << endl; //打印当前的结果 if (m_pathWordClass) WriteRes_WordClass(m_pathWordClass); if (m_pathClassWord) WriteRes_ClassWord(m_pathClassWord); if (m_pathTagVocab) WriteRes_TagVocab(m_pathTagVocab); nTimes++; if (nTimes == nMaxTime) { lout << "[end] Get Max Times" << endl; break; } if (!bChange) { lout << "[end] No Change" << endl; break; } } } void WordCluster::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_wordCount.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_aClass[w] = m_nClassNum - 1; else m_aClass[w] = n; } } void WordCluster_t::WriteRes(const char *path) { lout << "Write to " << path << endl; File f(path, "wt"); for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { f.Print("%d\t%d\n", w, m_mMap[w]); } } } void WordCluster_t::ReadRes(const char *path) { lout << "Read from" << path << endl; File f(path, "rt"); char *pLine; while (pLine = f.GetLine()) { int wid = atoi(strtok(pLine, " \t\n")); int cid = atoi(strtok(NULL, " \t\n")); m_mMap[wid] = cid; } } void WordCluster_t::InitCount(const char *path, const char *path_init_res /* = NULL */) { m_nSentNum = 0; m_nVocabSize = 0; m_nUnigramNum = 0; m_nBigramNum = 0; bool bFound; File file(path, "rt"); char *pLine = NULL; while (pLine = file.GetLine(true)) { Array<int> aWords; char 
*pWord = strtok(pLine, " \t"); while (pWord) { aWords.Add(atoi(pWord)); pWord = strtok(NULL, " \t"); } m_nSentNum += 1; //Unigram for (int i = 0; i < aWords.GetNum(); i++) { m_nVocabSize = max(m_nVocabSize, aWords[i] + 1); int *pCount = m_word_count.Insert(aWords[i], bFound); if (!bFound) { *pCount = 0; m_nUnigramNum++; } *pCount += 1; } //Bigram for (int i = 0; i < aWords.GetNum() - 1; i++) { int key[3]; key[0] = aWords[i]; key[1] = aWords[i + 1]; int *pCount = m_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; m_nBigramNum++; } *pCount += 1; Reverse(key); pCount = m_inv_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; } *pCount += 1; } } lout_variable(m_word_count.GetNum()); lout_variable(m_nVocabSize); lout_variable(m_nClassNum); lout_variable(m_nUnigramNum); lout_variable(m_nBigramNum); if (m_word_count.GetNum() < m_nClassNum) { lout << "The word_num(" << m_word_count.GetNum() << ") < class_num(" << m_nClassNum << ")" << endl; lout << "no need to cluster!!" 
<< endl; exit(1); } m_mMap.SetNum(m_nVocabSize); m_mMap.Fill(m_nVocabSize-1); if (path_init_res) { lout << "Init the class from file: " << path_init_res << endl; ReadRes(path_init_res); } else { lout << "Init the class based unigram count" << endl; Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *pCount = m_word_count.Find(w); if (pCount) { heap.Insert(w, *pCount); } else { heap.Insert(w, 0); // zero count } } int w, n; int nClass = 0; while (heap.OutTop(w, n)) { m_mMap[w] = min(nClass, m_nClassNum - 1); nClass++; } } WriteRes(m_pathRes); UpdateCount(m_mCountBuf); // lout_variable(m_dWordLogSum); // lout_variable(LogLikelihood(VecShell<int>(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()))); // cluster.InitCount(path); // lout_variable(cluster.m_dWordLogSum); // lout_variable(cluster.LogLikelihood()); } void WordCluster_t::UpdateCount(Array<int> &aCountBuf) { aCountBuf.Clean(); int *pCount; int wid; // class unigram LHashIter<int, int> hash_iter(&m_word_count); while (pCount = hash_iter.Next(wid)) { CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount); } // class bigram // add all the class bigram to buffer int wgram[10]; int keys[10]; for (keys[0] = 0; keys[0] < m_nClassNum; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_class_gram, keys, 2, 0); } } for (keys[0] = 0; keys[0] < m_nVocabSize; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0); CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0); } } // add the count of bigram Trie<int, int> *pSub; TrieIter2<int, int> trie_iter(&m_wgram_count, wgram, 2); lout.Progress(0, true, m_nBigramNum-1, "Update Bigram:"); while (pSub = trie_iter.Next()) { if (!pSub->IsDataLegal()) continue; int count = *pSub->GetData(); keys[0] = m_mMap[wgram[0]]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_class_gram, keys, 2, count); keys[0] = m_mMap[wgram[0]]; keys[1] = wgram[1]; Reverse(keys); 
//将word储存在前,为了方便遍历某个word的有效的前继class CountAdd(aCountBuf, m_class_word_gram, keys, 2, count); keys[0] = wgram[0]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_word_class_gram, keys, 2, count); lout.Progress(); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> iterW(&m_word_count); while (pCount = iterW.Next(wid)) { lout_assert(*pCount > 0); m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } lout << "Total Class Count Buf = " << aCountBuf.GetNum() << endl; // allocate the buffer of each thread CopyCountToThreads(aCountBuf); } void WordCluster_t::CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Insert(key, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Insert(pKey, nLen, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Find(key, bFound); if (!pIdx) { lout_error("[CountAdd] no find the hash key=" << key); } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Find(pKey, nLen, bFound); if (!pIdx) { lout_error("[CountAdd] no find the trie key=" << pKey[0]); } aCountBuf[*pIdx] += count; } void WordCluster_t::CopyCountToThreads(Array<int> &aCountBuf) { int nThread = omp_get_max_threads(); // copy count m_tCountBuf.Reset(nThread, aCountBuf.GetNum()); for (int t = 0; t < nThread; t++) { memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.GetBuffer(), sizeof(aCountBuf[0])*aCountBuf.GetNum()); } // copy class map m_tMap.Reset(nThread, m_nVocabSize); for (int t = 0; t < 
nThread; t++) { memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(), sizeof(m_mMap[0])*m_mMap.GetNum()); } } void WordCluster_t::MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut /* = true */) { if (m_word_count.Find(nWord) == NULL) return; // no such word in the count int nClass = vMap[nWord]; int sig = (bOut) ? -1 : 1; int tid = omp_get_thread_num(); int *pCount; /// class unigram pCount = m_word_count.Find(nWord); CountAdd(vCountBuf, m_class, nClass, sig *(*pCount)); /// class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_class_word_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; pSub = m_word_class_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wgram_count.Find(w, 2); if (pCount) { CountAdd(vCountBuf, m_class_gram, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; //遍历nWord的前继 pSub = m_inv_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; pCount = p->GetData(); CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount)); } } //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; 
MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." << endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i 
< nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> 
heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
378  pSub = m_wordGramCount.FindTrie(&nWord, 1);
379  if (pSub) {
380  TrieIter<int, int> iter(pSub);
381  Trie<int, int> *p;
382  while (p = iter.Next(v)) {
383  g[0] = v;
384  pCount = p->GetData();
385  if (pCount) {
386  //b2++;
387  CountAdd(m_classWordCount, g, 2, sig*(*pCount));
388  }
389  }
390  }
391 
392  // lout_variable(a1);
393  // lout_variable(a2);
394  // lout_variable(b1);
395  // lout_variable(b2);
396  // Pause();
397  }
398  void WordCluster::ExchangeWord(int nWord, int nToClass)
399  {
400  if (nToClass == m_aClass[nWord])
401  return;
402 
403  MoveWord(nWord, true); // move out from nClass
404  m_aClass[nWord] = nToClass;
405  MoveWord(nWord, false); // move into nToClass
406 
407  // m_aClass[nWord] = nToClass;
408  // UpdataCount();
409  }
410  void WordCluster::Cluster(int nMaxTime /* = -1 */)
411  {
412  bool bChange = false;
413  int nTimes = 0;
414 
415 
417  int nTotalSwitchClassNum = m_nClassNum;
418  //将未出现的word分到同一类
419  for (int w = 0; w < m_nVocabSize; w++) {
420  if (m_wordCount.Find(w) == NULL) {
421  m_aClass[w] = m_nClassNum - 1;
422  nTotalSwitchClassNum = m_nClassNum - 1;
423  }
424  }
425 
426  while (1) //外层循环
427  {
428  bChange = false;
429  int w;
430  // wbLHashIter<int,int> vocabIter(&m_wordCount);
431  // while (vocabIter.Next(w)) //遍历每个word
432  for (w = 0; w < m_nVocabSize; w++)
433  {
434  if (m_wordCount.Find(w) == NULL)
435  continue;
436 
437  int nOldClass = m_aClass[w]; //保存目前的class
438  int nOptClass = -1;
439  double dOptValue = -1e22; //对数似然值
440 
441  for (int c = 0; c < nTotalSwitchClassNum; c++) //转移到每个class
442  {
443  ExchangeWord(w, c);
444  double dCurValue = LogLikelihood(); //替换后的负对数似然值
445 // lout << w << " to class_" << c << " ll=" << dCurValue << endl;
446 // Pause();
447 
448  if (dCurValue > dOptValue) {
449  dOptValue = dCurValue;
450  nOptClass = c;
451  }
452  }
453 #ifdef _DEBUG
454  //lout<<"class_"<<nOldClass<<" -> class_"<<nOptClass<<endl;
455 #endif
456  ExchangeWord(w, nOptClass);
457  if (nOptClass != nOldClass) {//改变了类 lout << "[exchange_" << nTimes + 1 << "] " << w << " class_" << nOldClass << " -> class_" << nOptClass << " value=" << dOptValue << endl; bChange = true; if (nOptClass >= nTotalSwitchClassNum) { lout_error("[cluster] 未定义的to class-id (" << nOptClass << ") for word (" << w << ") "); } /*Pause();*/ } } lout_variable(LogLikelihood()); //统计下每个class中的词数目 Array<int> aNum(m_nClassNum); aNum.Fill(0); // vocabIter.Init(); // while (vocabIter.Next(w)) { for (w = 0; w < m_nVocabSize; w++) { if (m_aClass[w] >= m_nClassNum) { lout_error("[cluster] 未定义的class-id (" << m_aClass[w] << ") for word (" << w << ") "); } aNum[m_aClass[w]] ++; } for (int i = 0; i < m_nClassNum; i++) lout << i << "[" << aNum[i] << "] "; lout << endl; //打印当前的结果 if (m_pathWordClass) WriteRes_WordClass(m_pathWordClass); if (m_pathClassWord) WriteRes_ClassWord(m_pathClassWord); if (m_pathTagVocab) WriteRes_TagVocab(m_pathTagVocab); nTimes++; if (nTimes == nMaxTime) { lout << "[end] Get Max Times" << endl; break; } if (!bChange) { lout << "[end] No Change" << endl; break; } } } void WordCluster::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_wordCount.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_aClass[w] = m_nClassNum - 1; else m_aClass[w] = n; } } void WordCluster_t::WriteRes(const char *path) { lout << "Write to " << path << endl; File f(path, "wt"); for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { f.Print("%d\t%d\n", w, m_mMap[w]); } } } void WordCluster_t::ReadRes(const char *path) { lout << "Read from" << path << endl; File f(path, "rt"); char *pLine; while (pLine = f.GetLine()) { int wid = atoi(strtok(pLine, " \t\n")); int cid = 
atoi(strtok(NULL, " \t\n")); m_mMap[wid] = cid; } } void WordCluster_t::InitCount(const char *path, const char *path_init_res /* = NULL */) { m_nSentNum = 0; m_nVocabSize = 0; m_nUnigramNum = 0; m_nBigramNum = 0; bool bFound; File file(path, "rt"); char *pLine = NULL; while (pLine = file.GetLine(true)) { Array<int> aWords; char *pWord = strtok(pLine, " \t"); while (pWord) { aWords.Add(atoi(pWord)); pWord = strtok(NULL, " \t"); } m_nSentNum += 1; //Unigram for (int i = 0; i < aWords.GetNum(); i++) { m_nVocabSize = max(m_nVocabSize, aWords[i] + 1); int *pCount = m_word_count.Insert(aWords[i], bFound); if (!bFound) { *pCount = 0; m_nUnigramNum++; } *pCount += 1; } //Bigram for (int i = 0; i < aWords.GetNum() - 1; i++) { int key[3]; key[0] = aWords[i]; key[1] = aWords[i + 1]; int *pCount = m_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; m_nBigramNum++; } *pCount += 1; Reverse(key); pCount = m_inv_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; } *pCount += 1; } } lout_variable(m_word_count.GetNum()); lout_variable(m_nVocabSize); lout_variable(m_nClassNum); lout_variable(m_nUnigramNum); lout_variable(m_nBigramNum); if (m_word_count.GetNum() < m_nClassNum) { lout << "The word_num(" << m_word_count.GetNum() << ") < class_num(" << m_nClassNum << ")" << endl; lout << "no need to cluster!!" 
<< endl; exit(1); } m_mMap.SetNum(m_nVocabSize); m_mMap.Fill(m_nVocabSize-1); if (path_init_res) { lout << "Init the class from file: " << path_init_res << endl; ReadRes(path_init_res); } else { lout << "Init the class based unigram count" << endl; Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *pCount = m_word_count.Find(w); if (pCount) { heap.Insert(w, *pCount); } else { heap.Insert(w, 0); // zero count } } int w, n; int nClass = 0; while (heap.OutTop(w, n)) { m_mMap[w] = min(nClass, m_nClassNum - 1); nClass++; } } WriteRes(m_pathRes); UpdateCount(m_mCountBuf); // lout_variable(m_dWordLogSum); // lout_variable(LogLikelihood(VecShell<int>(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()))); // cluster.InitCount(path); // lout_variable(cluster.m_dWordLogSum); // lout_variable(cluster.LogLikelihood()); } void WordCluster_t::UpdateCount(Array<int> &aCountBuf) { aCountBuf.Clean(); int *pCount; int wid; // class unigram LHashIter<int, int> hash_iter(&m_word_count); while (pCount = hash_iter.Next(wid)) { CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount); } // class bigram // add all the class bigram to buffer int wgram[10]; int keys[10]; for (keys[0] = 0; keys[0] < m_nClassNum; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_class_gram, keys, 2, 0); } } for (keys[0] = 0; keys[0] < m_nVocabSize; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0); CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0); } } // add the count of bigram Trie<int, int> *pSub; TrieIter2<int, int> trie_iter(&m_wgram_count, wgram, 2); lout.Progress(0, true, m_nBigramNum-1, "Update Bigram:"); while (pSub = trie_iter.Next()) { if (!pSub->IsDataLegal()) continue; int count = *pSub->GetData(); keys[0] = m_mMap[wgram[0]]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_class_gram, keys, 2, count); keys[0] = m_mMap[wgram[0]]; keys[1] = wgram[1]; Reverse(keys); 
//将word储存在前,为了方便遍历某个word的有效的前继class CountAdd(aCountBuf, m_class_word_gram, keys, 2, count); keys[0] = wgram[0]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_word_class_gram, keys, 2, count); lout.Progress(); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> iterW(&m_word_count); while (pCount = iterW.Next(wid)) { lout_assert(*pCount > 0); m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } lout << "Total Class Count Buf = " << aCountBuf.GetNum() << endl; // allocate the buffer of each thread CopyCountToThreads(aCountBuf); } void WordCluster_t::CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Insert(key, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Insert(pKey, nLen, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Find(key, bFound); if (!pIdx) { lout_error("[CountAdd] no find the hash key=" << key); } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Find(pKey, nLen, bFound); if (!pIdx) { lout_error("[CountAdd] no find the trie key=" << pKey[0]); } aCountBuf[*pIdx] += count; } void WordCluster_t::CopyCountToThreads(Array<int> &aCountBuf) { int nThread = omp_get_max_threads(); // copy count m_tCountBuf.Reset(nThread, aCountBuf.GetNum()); for (int t = 0; t < nThread; t++) { memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.GetBuffer(), sizeof(aCountBuf[0])*aCountBuf.GetNum()); } // copy class map m_tMap.Reset(nThread, m_nVocabSize); for (int t = 0; t < 
nThread; t++) { memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(), sizeof(m_mMap[0])*m_mMap.GetNum()); } } void WordCluster_t::MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut /* = true */) { if (m_word_count.Find(nWord) == NULL) return; // no such word in the count int nClass = vMap[nWord]; int sig = (bOut) ? -1 : 1; int tid = omp_get_thread_num(); int *pCount; /// class unigram pCount = m_word_count.Find(nWord); CountAdd(vCountBuf, m_class, nClass, sig *(*pCount)); /// class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_class_word_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; pSub = m_word_class_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wgram_count.Find(w, 2); if (pCount) { CountAdd(vCountBuf, m_class_gram, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; //遍历nWord的前继 pSub = m_inv_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; pCount = p->GetData(); CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount)); } } //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; 
MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." << endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i 
< nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> 
heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
458  lout << "[exchange_" << nTimes + 1 << "] " << w << " class_" << nOldClass << " -> class_" << nOptClass << " value=" << dOptValue << endl;
459  bChange = true;
460  if (nOptClass >= nTotalSwitchClassNum) {
461  lout_error("[cluster] 未定义的to class-id (" << nOptClass << ") for word (" << w << ") ");
462  }
463  /*Pause();*/
464  }
465  }
466 
468  //统计下每个class中的词数目
469  Array<int> aNum(m_nClassNum);
470  aNum.Fill(0);
471  // vocabIter.Init();
472  // while (vocabIter.Next(w)) {
473  for (w = 0; w < m_nVocabSize; w++) {
474  if (m_aClass[w] >= m_nClassNum) {
475  lout_error("[cluster] 未定义的class-id (" << m_aClass[w] << ") for word (" << w << ") ");
476  }
477  aNum[m_aClass[w]] ++;
478  }
479 
480  for (int i = 0; i < m_nClassNum; i++)
481  lout << i << "[" << aNum[i] << "] ";
482  lout << endl;
483 
484  //打印当前的结果 if (m_pathWordClass) WriteRes_WordClass(m_pathWordClass); if (m_pathClassWord) WriteRes_ClassWord(m_pathClassWord); if (m_pathTagVocab) WriteRes_TagVocab(m_pathTagVocab); nTimes++; if (nTimes == nMaxTime) { lout << "[end] Get Max Times" << endl; break; } if (!bChange) { lout << "[end] No Change" << endl; break; } } } void WordCluster::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_wordCount.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_aClass[w] = m_nClassNum - 1; else m_aClass[w] = n; } } void WordCluster_t::WriteRes(const char *path) { lout << "Write to " << path << endl; File f(path, "wt"); for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { f.Print("%d\t%d\n", w, m_mMap[w]); } } } void WordCluster_t::ReadRes(const char *path) { lout << "Read from" << path << endl; File f(path, "rt"); char *pLine; while (pLine = f.GetLine()) { int wid = atoi(strtok(pLine, " \t\n")); int cid = atoi(strtok(NULL, " \t\n")); m_mMap[wid] = cid; } } void WordCluster_t::InitCount(const char *path, const char *path_init_res /* = NULL */) { m_nSentNum = 0; m_nVocabSize = 0; m_nUnigramNum = 0; m_nBigramNum = 0; bool bFound; File file(path, "rt"); char *pLine = NULL; while (pLine = file.GetLine(true)) { Array<int> aWords; char *pWord = strtok(pLine, " \t"); while (pWord) { aWords.Add(atoi(pWord)); pWord = strtok(NULL, " \t"); } m_nSentNum += 1; //Unigram for (int i = 0; i < aWords.GetNum(); i++) { m_nVocabSize = max(m_nVocabSize, aWords[i] + 1); int *pCount = m_word_count.Insert(aWords[i], bFound); if (!bFound) { *pCount = 0; m_nUnigramNum++; } *pCount += 1; } //Bigram for (int i = 0; i < aWords.GetNum() - 1; i++) { int key[3]; key[0] = aWords[i]; key[1] 
= aWords[i + 1]; int *pCount = m_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; m_nBigramNum++; } *pCount += 1; Reverse(key); pCount = m_inv_wgram_count.Insert(key, 2, bFound); if (!bFound) { *pCount = 0; } *pCount += 1; } } lout_variable(m_word_count.GetNum()); lout_variable(m_nVocabSize); lout_variable(m_nClassNum); lout_variable(m_nUnigramNum); lout_variable(m_nBigramNum); if (m_word_count.GetNum() < m_nClassNum) { lout << "The word_num(" << m_word_count.GetNum() << ") < class_num(" << m_nClassNum << ")" << endl; lout << "no need to cluster!!" << endl; exit(1); } m_mMap.SetNum(m_nVocabSize); m_mMap.Fill(m_nVocabSize-1); if (path_init_res) { lout << "Init the class from file: " << path_init_res << endl; ReadRes(path_init_res); } else { lout << "Init the class based unigram count" << endl; Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *pCount = m_word_count.Find(w); if (pCount) { heap.Insert(w, *pCount); } else { heap.Insert(w, 0); // zero count } } int w, n; int nClass = 0; while (heap.OutTop(w, n)) { m_mMap[w] = min(nClass, m_nClassNum - 1); nClass++; } } WriteRes(m_pathRes); UpdateCount(m_mCountBuf); // lout_variable(m_dWordLogSum); // lout_variable(LogLikelihood(VecShell<int>(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()))); // cluster.InitCount(path); // lout_variable(cluster.m_dWordLogSum); // lout_variable(cluster.LogLikelihood()); } void WordCluster_t::UpdateCount(Array<int> &aCountBuf) { aCountBuf.Clean(); int *pCount; int wid; // class unigram LHashIter<int, int> hash_iter(&m_word_count); while (pCount = hash_iter.Next(wid)) { CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount); } // class bigram // add all the class bigram to buffer int wgram[10]; int keys[10]; for (keys[0] = 0; keys[0] < m_nClassNum; keys[0]++) { for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_class_gram, keys, 2, 0); } } for (keys[0] = 0; keys[0] < m_nVocabSize; keys[0]++) { for (keys[1] = 0; keys[1] < 
m_nClassNum; keys[1]++) { CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0); CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0); } } // add the count of bigram Trie<int, int> *pSub; TrieIter2<int, int> trie_iter(&m_wgram_count, wgram, 2); lout.Progress(0, true, m_nBigramNum-1, "Update Bigram:"); while (pSub = trie_iter.Next()) { if (!pSub->IsDataLegal()) continue; int count = *pSub->GetData(); keys[0] = m_mMap[wgram[0]]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_class_gram, keys, 2, count); keys[0] = m_mMap[wgram[0]]; keys[1] = wgram[1]; Reverse(keys); //将word储存在前,为了方便遍历某个word的有效的前继class CountAdd(aCountBuf, m_class_word_gram, keys, 2, count); keys[0] = wgram[0]; keys[1] = m_mMap[wgram[1]]; CountAdd(aCountBuf, m_word_class_gram, keys, 2, count); lout.Progress(); } //Sum { N(w)logN(w) } lout << "Prepare Sum" << endl; m_dWordLogSum = 0; LHashIter<int, int> iterW(&m_word_count); while (pCount = iterW.Next(wid)) { lout_assert(*pCount > 0); m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount)); } lout << "Total Class Count Buf = " << aCountBuf.GetNum() << endl; // allocate the buffer of each thread CopyCountToThreads(aCountBuf); } void WordCluster_t::CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Insert(key, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Insert(pKey, nLen, bFound); if (!bFound) { *pIdx = aCountBuf.GetNum(); aCountBuf[*pIdx] = 0; } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count) { bool bFound; int *pIdx = hash.Find(key, bFound); if (!pIdx) { lout_error("[CountAdd] no find the hash key=" << key); } aCountBuf[*pIdx] += count; } void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, 
int *pKey, int nLen, int count) { bool bFound; int *pIdx = hash.Find(pKey, nLen, bFound); if (!pIdx) { lout_error("[CountAdd] no find the trie key=" << pKey[0]); } aCountBuf[*pIdx] += count; } void WordCluster_t::CopyCountToThreads(Array<int> &aCountBuf) { int nThread = omp_get_max_threads(); // copy count m_tCountBuf.Reset(nThread, aCountBuf.GetNum()); for (int t = 0; t < nThread; t++) { memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.GetBuffer(), sizeof(aCountBuf[0])*aCountBuf.GetNum()); } // copy class map m_tMap.Reset(nThread, m_nVocabSize); for (int t = 0; t < nThread; t++) { memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(), sizeof(m_mMap[0])*m_mMap.GetNum()); } } void WordCluster_t::MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut /* = true */) { if (m_word_count.Find(nWord) == NULL) return; // no such word in the count int nClass = vMap[nWord]; int sig = (bOut) ? -1 : 1; int tid = omp_get_thread_num(); int *pCount; /// class unigram pCount = m_word_count.Find(nWord); CountAdd(vCountBuf, m_class, nClass, sig *(*pCount)); /// class bigram int g[10]; int w[10]; g[1] = nClass; Trie<int, int> *pSub = m_class_word_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的前继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[0])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; pSub = m_word_class_gram.FindTrie(&nWord, 1); if (pSub) { //遍历所有可能的后继class TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(g[1])) { int count = vCountBuf[*p->GetData()]; CountAdd(vCountBuf, m_class_gram, g, 2, sig*count); } } g[0] = nClass; g[1] = nClass; w[0] = nWord; w[1] = nWord; pCount = m_wgram_count.Find(w, 2); if (pCount) { CountAdd(vCountBuf, m_class_gram, g, 2, *pCount); //加上count } // word class pair int v; g[1] = nClass; //遍历nWord的前继 pSub = m_inv_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) 
{ g[0] = v; g[1] = nClass; pCount = p->GetData(); CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount)); } } //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." 
<< endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i < nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> 
aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
485  if (m_pathWordClass)
487  if (m_pathClassWord)
489  if (m_pathTagVocab)
491 
492  nTimes++;
493  if (nTimes == nMaxTime) {
494  lout << "[end] Get Max Times" << endl;
495  break;
496  }
497  if (!bChange) {
498  lout << "[end] No Change" << endl;
499  break;
500  }
501  }
502 
503 
504  }
506  {
507  // 排序
509  for (int w = 0; w < m_nVocabSize; w++) {
510  int *p = m_wordCount.Find(w);
511  int nTemp = 0;;
512  if (p == NULL)
513  nTemp = 0;
514  else
515  nTemp = (int)sqrt((double)(*p));
516  heap.Insert(w, nTemp);
517  }
518 
519 
520  int n = -1;
521  int w, count, preCount = -1;
522  while (heap.OutTop(w, count)) {
523 
524  //确定当前的class
525  if (count != preCount) {
526  preCount = count;
527  n++;
528  }
529 
530 
531  if (n >= m_nClassNum)
532  m_aClass[w] = m_nClassNum - 1;
533  else
534  m_aClass[w] = n;
535 
536  }
537  }
538 
539 
540 
541  void WordCluster_t::WriteRes(const char *path)
542  {
543  lout << "Write to " << path << endl;
544  File f(path, "wt");
545  for (int w = 0; w < m_nVocabSize; w++) {
546  if (m_word_count.Find(w)) {
547  f.Print("%d\t%d\n", w, m_mMap[w]);
548  }
549  }
550  }
551  void WordCluster_t::ReadRes(const char *path)
552  {
553  lout << "Read from" << path << endl;
554  File f(path, "rt");
555  char *pLine;
556  while (pLine = f.GetLine()) {
557  int wid = atoi(strtok(pLine, " \t\n"));
558  int cid = atoi(strtok(NULL, " \t\n"));
559  m_mMap[wid] = cid;
560  }
561  }
562  void WordCluster_t::InitCount(const char *path, const char *path_init_res /* = NULL */)
563  {
564  m_nSentNum = 0;
565  m_nVocabSize = 0;
566  m_nUnigramNum = 0;
567  m_nBigramNum = 0;
568 
569  bool bFound;
570 
571  File file(path, "rt");
572  char *pLine = NULL;
573  while (pLine = file.GetLine(true)) {
574 
576 
577  char *pWord = strtok(pLine, " \t");
578  while (pWord) {
579  aWords.Add(atoi(pWord));
580  pWord = strtok(NULL, " \t");
581  }
582 
583  m_nSentNum += 1;
584  //Unigram
585  for (int i = 0; i < aWords.GetNum(); i++) {
586  m_nVocabSize = max(m_nVocabSize, aWords[i] + 1);
587  int *pCount = m_word_count.Insert(aWords[i], bFound);
588  if (!bFound) {
589  *pCount = 0;
590  m_nUnigramNum++;
591  }
592  *pCount += 1;
593  }
594  //Bigram
595  for (int i = 0; i < aWords.GetNum() - 1; i++) {
596  int key[3];
597  key[0] = aWords[i];
598  key[1] = aWords[i + 1];
599 
600  int *pCount = m_wgram_count.Insert(key, 2, bFound);
601  if (!bFound) {
602  *pCount = 0;
603  m_nBigramNum++;
604  }
605  *pCount += 1;
606 
607  Reverse(key);
608  pCount = m_inv_wgram_count.Insert(key, 2, bFound);
609  if (!bFound) {
610  *pCount = 0;
611  }
612  *pCount += 1;
613  }
614  }
615  lout_variable(m_word_count.GetNum());
620  if (m_word_count.GetNum() < m_nClassNum) {
621  lout << "The word_num(" << m_word_count.GetNum() << ") < class_num(" << m_nClassNum << ")" << endl;
622  lout << "no need to cluster!!" << endl;
623  exit(1);
624  }
625 
626  m_mMap.SetNum(m_nVocabSize);
627  m_mMap.Fill(m_nVocabSize-1);
628 
629  if (path_init_res) {
630  lout << "Init the class from file: " << path_init_res << endl;
631  ReadRes(path_init_res);
632  }
633  else {
634  lout << "Init the class based unigram count" << endl;
636  for (int w = 0; w < m_nVocabSize; w++) {
637  int *pCount = m_word_count.Find(w);
638  if (pCount) {
639  heap.Insert(w, *pCount);
640  }
641  else {
642  heap.Insert(w, 0); // zero count
643  }
644  }
645  int w, n;
646  int nClass = 0;
647  while (heap.OutTop(w, n)) {
648  m_mMap[w] = min(nClass, m_nClassNum - 1);
649  nClass++;
650  }
651  }
652  WriteRes(m_pathRes);
653  UpdateCount(m_mCountBuf);
654 // lout_variable(m_dWordLogSum);
655 // lout_variable(LogLikelihood(VecShell<int>(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum())));
656 
657 // cluster.InitCount(path);
658 // lout_variable(cluster.m_dWordLogSum);
659 // lout_variable(cluster.LogLikelihood());
660 
661  }
663  {
664  aCountBuf.Clean();
665  int *pCount;
666  int wid;
667  // class unigram
668  LHashIter<int, int> hash_iter(&m_word_count);
669  while (pCount = hash_iter.Next(wid)) {
670  CountAdd(aCountBuf, m_class, m_mMap[wid], *pCount);
671  }
672  // class bigram
673 
674  // add all the class bigram to buffer
675  int wgram[10];
676  int keys[10];
677  for (keys[0] = 0; keys[0] < m_nClassNum; keys[0]++) {
678  for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) {
679  CountAdd(aCountBuf, m_class_gram, keys, 2, 0);
680  }
681  }
682  for (keys[0] = 0; keys[0] < m_nVocabSize; keys[0]++) {
683  for (keys[1] = 0; keys[1] < m_nClassNum; keys[1]++) {
684  CountAdd(aCountBuf, m_word_class_gram, keys, 2, 0);
685  CountAdd(aCountBuf, m_class_word_gram, keys, 2, 0);
686  }
687  }
688 
689  // add the count of bigram
690  Trie<int, int> *pSub;
691  TrieIter2<int, int> trie_iter(&m_wgram_count, wgram, 2);
692  lout.Progress(0, true, m_nBigramNum-1, "Update Bigram:");
693  while (pSub = trie_iter.Next()) {
694  if (!pSub->IsDataLegal())
695  continue;
696  int count = *pSub->GetData();
697 
698  keys[0] = m_mMap[wgram[0]];
699  keys[1] = m_mMap[wgram[1]];
700  CountAdd(aCountBuf, m_class_gram, keys, 2, count);
701  keys[0] = m_mMap[wgram[0]];
702  keys[1] = wgram[1];
703  Reverse(keys); //将word储存在前,为了方便遍历某个word的有效的前继class
704  CountAdd(aCountBuf, m_class_word_gram, keys, 2, count);
705  keys[0] = wgram[0];
706  keys[1] = m_mMap[wgram[1]];
707  CountAdd(aCountBuf, m_word_class_gram, keys, 2, count);
708 
709  lout.Progress();
710  }
711 
712  //Sum { N(w)logN(w) }
713  lout << "Prepare Sum" << endl;
714  m_dWordLogSum = 0;
715  LHashIter<int, int> iterW(&m_word_count);
716  while (pCount = iterW.Next(wid)) {
717  lout_assert(*pCount > 0);
718  m_dWordLogSum += 1.0 * (*pCount) / m_nSentNum * log((double)(*pCount));
719  }
720 
721  lout << "Total Class Count Buf = " << aCountBuf.GetNum() << endl;
722  // allocate the buffer of each thread
723  CopyCountToThreads(aCountBuf);
724  }
725  void WordCluster_t::CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count)
726  {
727  bool bFound;
728  int *pIdx = hash.Insert(key, bFound);
729  if (!bFound) {
730  *pIdx = aCountBuf.GetNum();
731  aCountBuf[*pIdx] = 0;
732  }
733  aCountBuf[*pIdx] += count;
734  }
735  void WordCluster_t::CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count)
736  {
737  bool bFound;
738  int *pIdx = hash.Insert(pKey, nLen, bFound);
739  if (!bFound) {
740  *pIdx = aCountBuf.GetNum();
741  aCountBuf[*pIdx] = 0;
742  }
743  aCountBuf[*pIdx] += count;
744  }
745  void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count)
746  {
747  bool bFound;
748  int *pIdx = hash.Find(key, bFound);
749  if (!pIdx) {
750  lout_error("[CountAdd] no find the hash key=" << key);
751  }
752  aCountBuf[*pIdx] += count;
753  }
754  void WordCluster_t::CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count)
755  {
756  bool bFound;
757  int *pIdx = hash.Find(pKey, nLen, bFound);
758  if (!pIdx) {
759  lout_error("[CountAdd] no find the trie key=" << pKey[0]);
760  }
761  aCountBuf[*pIdx] += count;
762  }
764  {
765  int nThread = omp_get_max_threads();
766 
767  // copy count
768  m_tCountBuf.Reset(nThread, aCountBuf.GetNum());
769  for (int t = 0; t < nThread; t++) {
770  memcpy(m_tCountBuf[t].GetBuf(), aCountBuf.GetBuffer(), sizeof(aCountBuf[0])*aCountBuf.GetNum());
771  }
772  // copy class map
773  m_tMap.Reset(nThread, m_nVocabSize);
774  for (int t = 0; t < nThread; t++) {
775  memcpy(m_tMap[t].GetBuf(), m_mMap.GetBuffer(), sizeof(m_mMap[0])*m_mMap.GetNum());
776  }
777  }
778  void WordCluster_t::MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut /* = true */)
779  {
780  if (m_word_count.Find(nWord) == NULL)
781  return; // no such word in the count
782 
783  int nClass = vMap[nWord];
784  int sig = (bOut) ? -1 : 1;
785  int tid = omp_get_thread_num();
786  int *pCount;
788  pCount = m_word_count.Find(nWord);
789  CountAdd(vCountBuf, m_class, nClass, sig *(*pCount));
790 
792  int g[10];
793  int w[10];
794 
795  g[1] = nClass;
796  Trie<int, int> *pSub = m_class_word_gram.FindTrie(&nWord, 1);
797  if (pSub) { //遍历所有可能的前继class
798  TrieIter<int, int> iter(pSub);
799  Trie<int, int> *p;
800  while (p = iter.Next(g[0])) {
801  int count = vCountBuf[*p->GetData()];
802  CountAdd(vCountBuf, m_class_gram, g, 2, sig*count);
803  }
804  }
805 
806  g[0] = nClass;
807  pSub = m_word_class_gram.FindTrie(&nWord, 1);
808  if (pSub) { //遍历所有可能的后继class
809  TrieIter<int, int> iter(pSub);
810  Trie<int, int> *p;
811  while (p = iter.Next(g[1])) {
812  int count = vCountBuf[*p->GetData()];
813  CountAdd(vCountBuf, m_class_gram, g, 2, sig*count);
814  }
815  }
816 
817  g[0] = nClass;
818  g[1] = nClass;
819  w[0] = nWord;
820  w[1] = nWord;
821  pCount = m_wgram_count.Find(w, 2);
822  if (pCount) {
823  CountAdd(vCountBuf, m_class_gram, g, 2, *pCount); //加上count
824  }
825 
826  // word class pair
827 
828  int v;
829  g[1] = nClass;
830 
831  //遍历nWord的前继 pSub = m_inv_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; pCount = p->GetData(); CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount)); } } //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." 
<< endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i < nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> 
aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
832  pSub = m_inv_wgram_count.FindTrie(&nWord, 1);
833  if (pSub) {
834  TrieIter<int, int> iter(pSub);
835  Trie<int, int> *p;
836  while (p = iter.Next(v)) {
837  g[0] = v;
838  g[1] = nClass;
839  pCount = p->GetData();
840  CountAdd(vCountBuf, m_word_class_gram, g, 2, sig*(*pCount));
841  }
842  }
843  //遍历nWord后继 pSub = m_wgram_count.FindTrie(&nWord, 1); if (pSub) { TrieIter<int, int> iter(pSub); Trie<int, int> *p; while (p = iter.Next(v)) { g[0] = v; g[1] = nClass; ///< the inverse of class-word pairs pCount = p->GetData(); CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount)); } } } void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass) { if (nToClass == vMap[nWord]) return; MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass vMap[nWord] = nToClass; MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass } void WordCluster_t::Cluster(int nMaxTime /* = -1 */) { int nThread = omp_get_max_threads(); Array<int> aWordPreThread(nThread); Array<int> aOptClassPreThread(nThread); // get a observed word list Array<int> aWordList; for (int w = 0; w < m_nVocabSize; w++) { if (m_word_count.Find(w)) { aWordList.Add(w); } } lout << "[Cluster] max-thread = " << nThread << endl; lout << "[Cluster] observed word = " << aWordList.GetNum() << endl; lout << "[Cluster] Begin..." 
<< endl; double dPreValue = -1e22; for (int t = 0; t < nMaxTime; t++) { bool bChange = false; for (int block = 0; block < aWordList.GetNum() / nThread; block++) { // assign the count to each thread CopyCountToThreads(m_mCountBuf); #pragma omp parallel for for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) { int w = aWordList[i]; VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()]; VecShell<int> vMap = m_tMap[omp_get_thread_num()]; int nOptClass = -1; double dOptValue = -1e22; //对数似然值 for (int c = 0; c < m_nClassNum; c++) //转移到每个class { ExchangeWord(vCountBuf, vMap, w, c); double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值 // cluster.ExchangeWord(w, c); // double dCurValue_test = cluster.LogLikelihood(); // if (fabs(dCurValue - dCurValue_test) > 1e-5) { // lout_variable(dCurValue_test); // lout_variable(dCurValue); // lout_error("Error!"); // } // if (w == 15) { // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl; // Pause(); // } if (dCurValue > dOptValue) { dOptValue = dCurValue; nOptClass = c; } } //lout_variable(dOptValue); aWordPreThread[omp_get_thread_num()] = w; aOptClassPreThread[omp_get_thread_num()] = nOptClass; } // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i < nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> 
aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." << endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
844  pSub = m_wgram_count.FindTrie(&nWord, 1);
845  if (pSub) {
846  TrieIter<int, int> iter(pSub);
847  Trie<int, int> *p;
848  while (p = iter.Next(v)) {
849  g[0] = v;
850  g[1] = nClass;
851  pCount = p->GetData();
852  CountAdd(vCountBuf, m_class_word_gram, g, 2, sig*(*pCount));
853  }
854  }
855  }
856  void WordCluster_t::ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass)
857  {
858  if (nToClass == vMap[nWord])
859  return;
860 
861  MoveWord(vCountBuf, vMap, nWord, true); // move out from nClass
862  vMap[nWord] = nToClass;
863  MoveWord(vCountBuf, vMap, nWord, false); // move into nToClass
864  }
865  void WordCluster_t::Cluster(int nMaxTime /* = -1 */)
866  {
867  int nThread = omp_get_max_threads();
868  Array<int> aWordPreThread(nThread);
869  Array<int> aOptClassPreThread(nThread);
870 
871  // get a observed word list
872  Array<int> aWordList;
873  for (int w = 0; w < m_nVocabSize; w++) {
874  if (m_word_count.Find(w)) {
875  aWordList.Add(w);
876  }
877  }
878  lout << "[Cluster] max-thread = " << nThread << endl;
879  lout << "[Cluster] observed word = " << aWordList.GetNum() << endl;
880  lout << "[Cluster] Begin..." << endl;
881 
882  double dPreValue = -1e22;
883 
884 
885  for (int t = 0; t < nMaxTime; t++) {
886  bool bChange = false;
887  for (int block = 0; block < aWordList.GetNum() / nThread; block++) {
888  // assign the count to each thread
889  CopyCountToThreads(m_mCountBuf);
890 #pragma omp parallel for
891  for (int i = block*nThread; i < min(aWordList.GetNum(), (block + 1)*nThread); i++) {
892  int w = aWordList[i];
893 
894  VecShell<int> vCountBuf = m_tCountBuf[omp_get_thread_num()];
895  VecShell<int> vMap = m_tMap[omp_get_thread_num()];
896 
897 
898  int nOptClass = -1;
899  double dOptValue = -1e22; //对数似然值
900 
901  for (int c = 0; c < m_nClassNum; c++) //转移到每个class
902  {
903  ExchangeWord(vCountBuf, vMap, w, c);
904  double dCurValue = LogLikelihood(vCountBuf); //替换后的负对数似然值
905 
906 // cluster.ExchangeWord(w, c);
907 // double dCurValue_test = cluster.LogLikelihood();
908 // if (fabs(dCurValue - dCurValue_test) > 1e-5) {
909 // lout_variable(dCurValue_test);
910 // lout_variable(dCurValue);
911 // lout_error("Error!");
912 // }
913 
914 // if (w == 15) {
915 // lout << "w=" << w << " c=" << c << " d1=" << dCurValue << " d2=" << dCurValue_test << endl;
916 // Pause();
917 // }
918 
919  if (dCurValue > dOptValue) {
920  dOptValue = dCurValue;
921  nOptClass = c;
922  }
923  }
924  //lout_variable(dOptValue);
925  aWordPreThread[omp_get_thread_num()] = w;
926  aOptClassPreThread[omp_get_thread_num()] = nOptClass;
927  }
928 
929  // 汇总 VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum()); VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum()); for (int i = 0; i < nThread; i++) { int w = aWordPreThread[i]; int c_old = main_map[w]; int c_new = aOptClassPreThread[i]; ExchangeWord(main_buf, main_map, w, c_new); double dCurValue = LogLikelihood(main_buf); // cluster.ExchangeWord(w, c_new); // double dCutValue_test = cluster.LogLikelihood(); // lout_variable(dCurValue); // lout_variable(dCutValue_test); if (c_old != c_new) { lout << "[exchange " << t << "] w=" << w << " from class_" << c_old << " to class_" << c_new << " LL=" << dCurValue << endl; bChange = true; } } } /* write res */ WriteRes(m_pathRes); /* count the number at each class */ Vec<int> aClassContent(m_nClassNum); aClassContent.Fill(0); for (int w = 0; w < m_nVocabSize; w++) { aClassContent[m_mMap[w]]++; } lout << "[exchange " << t << " end] "; for (int c = 0; c < m_nClassNum; c++) { lout << c << "[" << aClassContent[c] << "] "; } lout << endl; if (bChange == false) { lout << "unchange..." 
<< endl; break; } } lout << "[Cluster] End" << endl; } double WordCluster_t::LogLikelihood(VecShell<int> vCountBuf) { double dSumClassGram = 0; double dSumClass = 0; double dSumWord = 0; int keys[10]; Trie<int, int> *psub; TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2); while (psub = trie_iter2.Next()) { int count = vCountBuf[*psub->GetData()]; if (count > 0) dSumClassGram += 1.0 * count / m_nSentNum * log((double)count); } // for (int i = 0; i < m_nClassNum + 1; i++) { // for (int j = 0; j < m_nClassNum + 1; j++) { // int n = m_pClassGramCount[i][j]; // if (n < 0) { // lout_error("classGramCount (" << n << ") < 0") // } // if (n != 0) { // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n); // } // } // } //Sum { N(g)logN(g) } int c, w; int *pIdx; LHashIter<int, int> iterC(&m_class); while (pIdx = iterC.Next(c)) { int count = vCountBuf[*pIdx]; if (count>0) dSumClass += 1.0 * count / m_nSentNum * log((double)count); } double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum; return dRes; } void WordCluster_t::SimpleCluster() { // 排序 Heap<int, int> heap(HEAPMODE_MAXHEAP); for (int w = 0; w < m_nVocabSize; w++) { int *p = m_word_count.Find(w); int nTemp = 0;; if (p == NULL) nTemp = 0; else nTemp = (int)sqrt((double)(*p)); ///< 对词频计算平方根 heap.Insert(w, nTemp); } int n = -1; int w, count, preCount = -1; while (heap.OutTop(w, count)) { //确定当前的class if (count != preCount) { preCount = count; n++; } if (n >= m_nClassNum) m_mMap[w] = m_nClassNum - 1; else m_mMap[w] = n; } } }
930  VecShell<int> main_buf(m_mCountBuf.GetBuffer(), m_mCountBuf.GetNum());
931  VecShell<int> main_map(m_mMap.GetBuffer(), m_mMap.GetNum());
932  for (int i = 0; i < nThread; i++) {
933 
934  int w = aWordPreThread[i];
935  int c_old = main_map[w];
936  int c_new = aOptClassPreThread[i];
937 
938  ExchangeWord(main_buf, main_map, w, c_new);
939  double dCurValue = LogLikelihood(main_buf);
940 
941 // cluster.ExchangeWord(w, c_new);
942 // double dCutValue_test = cluster.LogLikelihood();
943 // lout_variable(dCurValue);
944 // lout_variable(dCutValue_test);
945 
946 
947  if (c_old != c_new) {
948  lout << "[exchange " << t << "] w=" << w
949  << " from class_" << c_old
950  << " to class_" << c_new
951  << " LL=" << dCurValue << endl;
952  bChange = true;
953  }
954  }
955  }
956 
957  /* write res */
958  WriteRes(m_pathRes);
959 
960  /* count the number at each class */
961  Vec<int> aClassContent(m_nClassNum);
962  aClassContent.Fill(0);
963  for (int w = 0; w < m_nVocabSize; w++) {
964  aClassContent[m_mMap[w]]++;
965  }
966  lout << "[exchange " << t << " end] ";
967  for (int c = 0; c < m_nClassNum; c++) {
968  lout << c << "[" << aClassContent[c] << "] ";
969  }
970  lout << endl;
971 
972  if (bChange == false) {
973  lout << "unchange..." << endl;
974  break;
975  }
976  }
977 
978  lout << "[Cluster] End" << endl;
979 
980  }
982  {
983  double dSumClassGram = 0;
984  double dSumClass = 0;
985  double dSumWord = 0;
986 
987  int keys[10];
988  Trie<int, int> *psub;
989  TrieIter2<int, int> trie_iter2(&m_class_gram, keys, 2);
990  while (psub = trie_iter2.Next()) {
991  int count = vCountBuf[*psub->GetData()];
992  if (count > 0)
993  dSumClassGram += 1.0 * count / m_nSentNum * log((double)count);
994  }
995 
996 // for (int i = 0; i < m_nClassNum + 1; i++) {
997 // for (int j = 0; j < m_nClassNum + 1; j++) {
998 // int n = m_pClassGramCount[i][j];
999 // if (n < 0) {
1000 // lout_error("classGramCount (" << n << ") < 0")
1001 // }
1002 // if (n != 0) {
1003 // dSumClassGram += 1.0 * n / m_nSentNum * log((double)n);
1004 // }
1005 // }
1006 // }
1007 
1008 
1009 
1010  //Sum { N(g)logN(g) }
1011  int c, w;
1012  int *pIdx;
1013  LHashIter<int, int> iterC(&m_class);
1014  while (pIdx = iterC.Next(c)) {
1015  int count = vCountBuf[*pIdx];
1016  if (count>0)
1017  dSumClass += 1.0 * count / m_nSentNum * log((double)count);
1018  }
1019 
1020  double dRes = dSumClassGram - 2 * dSumClass + m_dWordLogSum;
1021  return dRes;
1022  }
1023 
1025  {
1026  // 排序
1028  for (int w = 0; w < m_nVocabSize; w++) {
1029  int *p = m_word_count.Find(w);
1030  int nTemp = 0;;
1031  if (p == NULL)
1032  nTemp = 0;
1033  else
1034  nTemp = (int)sqrt((double)(*p));
1035  heap.Insert(w, nTemp);
1036  }
1037 
1038 
1039  int n = -1;
1040  int w, count, preCount = -1;
1041  while (heap.OutTop(w, count)) {
1042 
1043  //确定当前的class
1044  if (count != preCount) {
1045  preCount = count;
1046  n++;
1047  }
1048 
1049 
1050  if (n >= m_nClassNum)
1051  m_mMap[w] = m_nClassNum - 1;
1052  else
1053  m_mMap[w] = n;
1054 
1055  }
1056  }
1057 }
_wb_TRIE * Next()
Get next trie.
Definition: wb-trie.h:268
void Cluster(int nMaxTime=-1)
#define SAFE_DELETE(p)
memory release
Definition: wb-vector.h:49
DataT * Next(KeyT &key)
get next value
Definition: wb-lhash.h:576
void CopyCountToThreads(Array< int > &aCountBuf)
T & Get(int i)
get the value at position i
Definition: wb-vector.h:99
void SimpleCluster()
使用出现频率进行简单的分类,不需要迭代
void Reverse(int *pGram)
bool IsDataLegal()
detect if current trie have legal value
Definition: wb-trie.h:130
#define lout_error(x)
Definition: wb-log.h:183
#define lout_assert(p)
Definition: wb-log.h:185
void Read_TagVocab(const char *path)
DataT * Find(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Find a value.
Definition: wb-trie.h:132
void WriteRes_WordClass(const char *path)
Get all the values whose indexes are of a fixed length. The returned tries may not contain a legal va...
Definition: wb-trie.h:42
void InitCount(const char *path, const char *pTagVocab=NULL)
void InitCount(const char *path, const char *path_init_res=NULL)
void WriteRes_TagVocab(const char *path)
void ExchangeWord(VecShell< int > vCountBuf, VecShell< int > vMap, int nWord, int nToClass)
exchange nWord from m_aClass[nWord] to nToClass
Array< int > aWords(omp_get_max_threads())
void WriteRes(const char *path)
T * GetBuffer(int i=0) const
get the buffer pointer
Definition: wb-vector.h:97
#define lout_variable(x)
Definition: wb-log.h:179
void MoveWord(int nWord, bool bOut=true)
void WriteCount(LHash< int, int > &count, File &file)
void Insert(TValue p_value, TWeight p_w)
insert a value
Definition: wb-heap.h:157
virtual void Print(const char *p_pMessage,...)
print
Definition: wb-file.cpp:115
iter all the sub-tries
Definition: wb-trie.h:41
int GetNum() const
Get the unit number.
Definition: wb-lhash.h:235
int m_nSentNum
文本中的词总数
DataT * Insert(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Insert a value.
Definition: wb-trie.h:142
Definition: wb-mat.h:29
_wb_TRIE * FindTrie(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Find a sub-trie.
Definition: wb-trie.h:148
void Insert(T t)
insert a value. Avoid repeating
Definition: wb-vector.h:298
the iter of LHash
Definition: wb-lhash.h:42
void Fill(T v)
Definition: wb-mat.h:279
file class.
Definition: wb-file.h:94
Array< int > m_aClass
记录每个词w所在的类g
void ReadRes(const char *path)
#define HEAPMODE_MAXHEAP
max heap, used to sort from large to small
Definition: wb-heap.h:55
void SetNum(int n)
Set Array number, to malloc enough memory.
Definition: wb-vector.h:238
virtual char * GetLine(bool bPrecent=false)
Read a line into the buffer.
Definition: wb-file.cpp:47
void Progress(long long n=-1, bool bInit=false, long long total=100, const char *head="")
progress bar
Definition: wb-log.cpp:146
_wb_TRIE * Next(KeyT &key)
Get next sub-trie.
Definition: wb-trie.h:225
void MoveWord(VecShell< int > vCountBuf, VecShell< int > vMap, int nWord, bool bOut=true)
move word in/out of a class and update the counts
DataT * Insert(KeyT key, bool &bFound)
Insert a value.
Definition: wb-lhash.h:408
void Clean()
Clean the array. Just set the top of array to -1 and donot release the memory.
Definition: wb-vector.h:258
LHash< int, int > m_wordCount
N(w)
Trie< int, int > m_wordGramCount
N(w,v)
double LogLikelihood(VecShell< int > vCountBuf)
calculate the log-likelihood
int GetNum() const
Get Array number.
Definition: wb-vector.h:240
void Add(T t)
Add a value to the tail of array.
Definition: wb-vector.h:242
void CountAdd(Array< int > &aCountBuf, LHash< int, int > &hash, int key, int count)
Log lout
the definition is in wb-log.cpp
Definition: wb-log.cpp:22
DataT * GetData()
Get value.
Definition: wb-trie.h:126
Trie< int, int > m_wordClassCount
N(w,g), 储存时,w在前,g在后
void Fill(T m)
set all the values to m
Definition: wb-vector.h:139
Trie< int, int > m_invWordGram
储存每个w的前继,不计数,仅用于索引每个w的前继v
void WriteRes_ClassWord(const char *path)
LHash< int, int > m_classCount
N(g)
int m_nVocabSize
word-id的个数
void SimpleCluster()
使用出现频率进行简单的分类,不需要迭代
void ExchangeWord(int nWord, int nToClass)
exchange nWord from m_aClass[nWord] to nToClass
Trie< int, int > m_classWordCount
N(g,w), 储存时,w在前,g在后
bool OutTop(TValue &p_value, TWeight &p_w)
out the top
Definition: wb-heap.h:184
heap
Definition: wb-heap.h:75
void Fill(DataT d)
set all the values to d
Definition: wb-trie.h:115
void UpdateCount(Array< int > &aCountBuf)
int nWord
Definition: main-TRF.cpp:173
void CountAdd(LHash< int, int > &count, int nWord, int nAdd)
double m_dWordLogSum
记录sum{N(w)logN(w)} ,因为仅仅需要计算一次
void Fill(DataT d)
Set all the values to d.
Definition: wb-lhash.h:322
define all the code written by Bin Wang.
Definition: wb-file.cpp:21
void Cluster(int nMaxTime=-1)
cluster
int ** m_pClassGramCount
N(g_w,g_v);.
DataT * Find(KeyT key, bool &bFound)
Find a value.
Definition: wb-lhash.h:392