17 #ifndef _WB_WORD_CLUSTER_H_ 18 #define _WB_WORD_CLUSTER_H_ 63 m_pathWordClass = NULL;
64 m_pathClassWord = NULL;
65 m_pathTagVocab = NULL;
/// Swap the first two entries of pGram in place, so (a, b) becomes (b, a).
/// @param pGram pointer to at least two ints; both are overwritten.
void Reverse(int *pGram)
{
    const int nSecond = pGram[1];
    pGram[1] = pGram[0];
    pGram[0] = nSecond;
}
72 void InitCount(
const char *path,
const char *pTagVocab = NULL);
76 int *pCount = count.
Insert(nWord, bFound);
77 if (!bFound) *pCount = nAdd;
82 int *pCount = count.
Insert(pWord, nLen, bFound);
83 if (!bFound) *pCount = nAdd;
86 void CountAdd(
int **pCount,
int *pWord,
int nLen,
int nAdd) {
87 pCount[pWord[0]][pWord[1]] += nAdd;
101 void Cluster(
int nMaxTime = -1);
153 if (pathRes == NULL) {
154 m_pathRes =
"word_cluster.default.res";
162 void WriteRes(
const char *path);
163 void ReadRes(
const char *path);
/// Exchange pGram[0] and pGram[1] in place.
/// @param pGram pointer to at least two ints; the pair is reordered.
void Reverse(int *pGram)
{
    int *pFirst = pGram;
    int *pNext = pGram + 1;
    const int nHeld = *pFirst;
    *pFirst = *pNext;
    *pNext = nHeld;
}
165 void InitCount(
const char *path,
const char *path_init_res = NULL);
172 void CopyCountToThreads(
Array<int> &aCountBuf);
178 void Cluster(
int nMaxTime = -1);
void Cluster(int nMaxTime=-1)
void CountAdd(Trie< int, int > &count, int *pWord, int nLen, int nAdd)
void Read_TagVocab(const char *path)
void WriteRes_WordClass(const char *path)
void InitCount(const char *path, const char *pTagVocab=NULL)
WordCluster_t(int nClass, char *pathRes=NULL)
void WriteRes_TagVocab(const char *path)
#define SAFE_NEW_DARRAY(p, Type, n, m)
Trie< int, int > m_class_gram
(g(w), g(v)) the index of the class ngram
LHash< int, int > m_class
index the class
void MoveWord(int nWord, bool bOut=true)
void WriteCount(LHash< int, int > &count, File &file)
Trie< int, int > m_inv_wgram_count
N(v,w) inverse word bigram count.
DataT * Insert(const KeyT *p_pIndex, int nIndexLen, bool &bFound)
Insert a value.
Trie< int, int > m_wgram_count
N(w,v) word bigram count.
Array< int > m_aClass
records the class g that each word w belongs to
#define SAFE_DELETE_DARRAY(p, n)
DataT * Insert(KeyT key, bool &bFound)
Insert a value.
String m_pathRes
the result file, [ w g(w) ]
LHash< int, int > m_wordCount
N(w)
Mat< int > m_tCountBuf
the count buffer for each thread
Trie< int, int > m_wordGramCount
N(w,v)
void CountAdd(int **pCount, int *pWord, int nLen, int nAdd)
int m_nSentNum
total sentence number
Array< int > m_mCountBuf
the count buffer in the main thread
Trie< int, int > m_wordClassCount
N(w,g); stored with w first and g second
Trie< int, int > m_invWordGram
stores the predecessors of each w without counts; used only to index the predecessors v of each w
int m_nClassNum
the maximum class number
Array< int > m_mMap
the final g(w)
void WriteRes_ClassWord(const char *path)
Trie< int, int > m_word_class_gram
(w,g) the word-class ngram
LHash< int, int > m_classCount
N(g)
int m_nVocabSize
the number of word-ids
void SimpleCluster()
simple classification based on occurrence frequency; no iteration required
void ExchangeWord(int nWord, int nToClass)
move the word nWord from its current class m_aClass[nWord] to class nToClass
Trie< int, int > m_classWordCount
N(g,w); stored with w first and g second
Trie< int, int > m_class_word_gram
(g,w) the class-word ngram
int m_nVocabSize
word number, i.e. the maximum word-id + 1
Mat< int > m_tMap
maps each word to its corresponding class in each thread
include all the wb-written modules
void CountAdd(LHash< int, int > &count, int nWord, int nAdd)
double m_dWordLogSum
records sum{N(w)logN(w)}, because it only needs to be computed once
define all the code written by Bin Wang.
LHash< int, int > m_word_count
N(w) index the word unigram count.
int ** m_pClassGramCount
N(g_w,g_v).