TRF Language Model
wb-word-cluster.h
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding language may use different comment styles.
16 
17 #ifndef _WB_WORD_CLUSTER_H_
18 #define _WB_WORD_CLUSTER_H_
19 
20 #include "wb-system.h"
21 
22 namespace wb
23 {
33  {
34  public:
39  //wbTrie<int, int> m_classGramCount; ///< N(g_w,g_v);
43 
44  double m_dWordLogSum;
45 
49  int m_nSentNum;
50 
53 
57 
58 
59  public:
60  WordCluster(int nClass) : m_nClassNum(nClass){
61  SAFE_NEW_DARRAY(m_pClassGramCount, int, nClass + 1, nClass + 1);
62 
63  m_pathWordClass = NULL;
64  m_pathClassWord = NULL;
65  m_pathTagVocab = NULL;
66  };
67  ~WordCluster(void) {
68  SAFE_DELETE_DARRAY(m_pClassGramCount, m_nClassNum + 1);
69  };
70 
/// Swap the two entries of a bigram in place.
///
/// @param pGram  pointer to at least two ints; pGram[0] and pGram[1] are
///               exchanged, nothing else is touched.
void Reverse(int *pGram)
{
    const int nFirst = pGram[0];
    pGram[0] = pGram[1];
    pGram[1] = nFirst;
}
72  void InitCount(const char *path, const char *pTagVocab = NULL);
73  void UpdataCount();
74  void CountAdd(LHash<int, int> &count, int nWord, int nAdd) {
75  bool bFound;
76  int *pCount = count.Insert(nWord, bFound);
77  if (!bFound) *pCount = nAdd;
78  else *pCount += nAdd;
79  }
80  void CountAdd(Trie<int, int> &count, int *pWord, int nLen, int nAdd) {
81  bool bFound;
82  int *pCount = count.Insert(pWord, nLen, bFound);
83  if (!bFound) *pCount = nAdd;
84  else *pCount += nAdd;
85  }
/// Add nAdd to one cell of a 2-D count table addressed by a bigram.
///
/// The cell is pCount[pWord[0]][pWord[1]].  No bounds checking is done.
///
/// @param pCount  table of row pointers.
/// @param pWord   bigram; pWord[0] selects the row, pWord[1] the column.
/// @param nLen    not used by this overload; kept so the signature parallels
///                the Trie overload.
/// @param nAdd    amount to add.
void CountAdd(int **pCount, int *pWord, int nLen, int nAdd)
{
    int *pRow = pCount[pWord[0]];
    pRow[pWord[1]] += nAdd;
}
89  void WriteCount(LHash<int, int> &count, File &file);
90  void WriteCount(Trie<int, int> &count, File &file, bool bReverse = false);
91  void WriteRes_WordClass(const char *path);
92  void WriteRes_ClassWord(const char *path);
93  void WriteRes_TagVocab(const char *path);
94  void Read_TagVocab(const char *path);
95 
96  double LogLikelihood();
97  void MoveWord(int nWord, bool bOut = true);
99  void ExchangeWord(int nWord, int nToClass);
100 
101  void Cluster(int nMaxTime = -1);
102 
104  void SimpleCluster();
105  };
106 
112  {
113  public:
117 
122 
125 
128 
129 
133 
137 
139 
140 // char *m_pathWordClass;
141 // char *m_pathClassWord;
142 // char *m_pathTagVocab;
143 
144  //WordCluster cluster;
145 
146  public:
147  WordCluster_t(int nClass, char *pathRes = NULL):
148  //cluster(nClass),
149  m_nClassNum(nClass){
150 // m_pathWordClass = NULL;
151 // m_pathClassWord = NULL;
152 // m_pathTagVocab = NULL;
153  if (pathRes == NULL) {
154  m_pathRes = "word_cluster.default.res";
155  }
156  else {
157  m_pathRes = pathRes;
158  }
159  };
161  };
162  void WriteRes(const char *path);
163  void ReadRes(const char *path);
/// Swap the two entries of a bigram in place.
///
/// @param pGram  pointer to at least two ints; pGram[0] and pGram[1] are
///               exchanged, nothing else is touched.
void Reverse(int *pGram)
{
    const int nFirst = pGram[0];
    pGram[0] = pGram[1];
    pGram[1] = nFirst;
}
165  void InitCount(const char *path, const char *path_init_res = NULL);
166  void UpdateCount(Array<int> &aCountBuf);
167  void CountAdd(Array<int> &aCountBuf, LHash<int, int> &hash, int key, int count);
168  void CountAdd(Array<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count);
169  void CountAdd(VecShell<int> &aCountBuf, LHash<int, int> &hash, int key, int count);
170  void CountAdd(VecShell<int> &aCountBuf, Trie<int, int> &hash, int *pKey, int nLen, int count);
171 
172  void CopyCountToThreads(Array<int> &aCountBuf);
174  void MoveWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, bool bOut = true);
176  void ExchangeWord(VecShell<int> vCountBuf, VecShell<int> vMap, int nWord, int nToClass);
178  void Cluster(int nMaxTime = -1);
180  double LogLikelihood(VecShell<int> vCountBuf);
181 
183  void SimpleCluster();
184  };s
186 }
187 
188 #endif
void Cluster(int nMaxTime=-1)
a dynamic string class
Definition: wb-string.h:53
void Reverse(int *pGram)
void CountAdd(Trie< int, int > &count, int *pWord, int nLen, int nAdd)
void Read_TagVocab(const char *path)
void WriteRes_WordClass(const char *path)
void InitCount(const char *path, const char *pTagVocab=NULL)
WordCluster_t(int nClass, char *pathRes=NULL)
void WriteRes_TagVocab(const char *path)
#define SAFE_NEW_DARRAY(p, Type, n, m)
Definition: wb-vector.h:44
Trie< int, int > m_class_gram
(g(w), g(v)) the index of the class ngram
LHash< int, int > m_class
index the class
void MoveWord(int nWord, bool bOut=true)
void WriteCount(LHash< int, int > &count, File &file)
Trie< int, int > m_inv_wgram_count
N(v,w) inverse word bigram count.
int m_nSentNum
文本中的词总数
DataT * Insert(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Insert a value.
Definition: wb-trie.h:142
Trie< int, int > m_wgram_count
N(w,v) word bigram count.
file class.
Definition: wb-file.h:94
Array< int > m_aClass
记录每个词w所在的类g
#define SAFE_DELETE_DARRAY(p, n)
Definition: wb-vector.h:51
DataT * Insert(KeyT key, bool &bFound)
Insert a value.
Definition: wb-lhash.h:408
String m_pathRes
the result file, [ w g(w) ]
LHash< int, int > m_wordCount
N(w)
Mat< int > m_tCountBuf
the count buffer for each thread
Trie< int, int > m_wordGramCount
N(w,v)
void CountAdd(int **pCount, int *pWord, int nLen, int nAdd)
int m_nSentNum
total sentence number
Array< int > m_mCountBuf
the count buffer in the main thread
Trie< int, int > m_wordClassCount
N(w,g), 储存时,w在前,g在后
Trie< int, int > m_invWordGram
储存每个w的前继,不计数,仅用于索引每个w的前继v
int m_nClassNum
the maximum class number
Array< int > m_mMap
the final g(w)
void WriteRes_ClassWord(const char *path)
Trie< int, int > m_word_class_gram
(w,g) the word-class ngram
LHash< int, int > m_classCount
N(g)
int m_nVocabSize
word-id的个数
void Reverse(int *pGram)
void SimpleCluster()
使用出现频率进行简单的分类,不需要迭代
void ExchangeWord(int nWord, int nToClass)
move the word nWord from its current class m_aClass[nWord] to the class nToClass
Trie< int, int > m_classWordCount
N(g,w), 储存时,w在前,g在后
Trie< int, int > m_class_word_gram
(g,w) the class-word ngram
int m_nVocabSize
word number, i.e. the maximum word-id + 1
Mat< int > m_tMap
maps each word to its corresponding class, one mapping per thread
int nWord
Definition: main-TRF.cpp:173
WordCluster(int nClass)
include all the wb-written modules
void CountAdd(LHash< int, int > &count, int nWord, int nAdd)
double m_dWordLogSum
记录sum{N(w)logN(w)} ,因为仅仅需要计算一次
define all the code written by Bin Wang.
Definition: wb-file.cpp:21
LHash< int, int > m_word_count
N(w) index the word unigram count.
int ** m_pClassGramCount
N(g_w, g_v).