TRF Language Model
main-word-cluster.cpp
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding language may use different comment styles.
16 
31 #include "wb-word-cluster.h"
32 #include <omp.h>
33 using namespace wb;
34 
// ---------------------------------------------------------------------------
// Command-line configuration. Each variable is bound to an option label in
// the entry point below and is overwritten there by opt.Parse().
// ---------------------------------------------------------------------------

// File paths (NULL means the option was not supplied).
static char *cfg_pathTxt      = NULL;  // option "txt": input text, forwarded to InitCount()
static char *cfg_pathReadRes  = NULL;  // option "read": res file, second argument of InitCount()
static char *cfg_pathWriteRes = NULL;  // option "write": res file, given to the cluster and to WriteRes()

// Clustering parameters.
static int  cfg_nClassNum      = 10;     // option "num": class count handed to WordCluster_t
static bool cfg_bSimpleCluster = false;  // option "simple-cluster": run SimpleCluster() instead of Cluster()

// Parallelism.
static int cfg_nThread = -1;  // option "thread": only applied to OpenMP when > 0

// Disabled settings kept for reference: cfg_pathWordClass, cfg_pathClassWord,
// cfg_pathTagVocab and cfg_pathReadTagVocab (all were `static char *` = NULL).
46 
48 
49 _wbMain
50 {
51  opt.Add(wbOPT_STRING, "txt", &cfg_pathTxt, "input txt(word id, begin from 0)");
52 // opt.Add(wbOPT_STRING, "out-wc", &cfg_pathWordClass, "output the cluster file, [word_id, class_id]");
53 // opt.Add(wbOPT_STRING, "out-cw", &cfg_pathClassWord, "output the cluster file, [class_id, word_id1, word_id2,...]");
54 // opt.Add(wbOPT_STRING, "tag-vocab", &cfg_pathTagVocab, "output a tag-vocab file? [word_id, word_id, class_id]");
55  opt.Add(wbOPT_INT, "num", &cfg_nClassNum, "class num");
56 // opt.Add(wbOPT_STRING, "read-tag-vocab", &cfg_pathReadTagVocab, "read tag-vocab, calculate the likelihood");
57  opt.Add(wbOPT_STRING, "read", &cfg_pathReadRes, "read the res file [word-id, class-id]");
58  opt.Add(wbOPT_STRING, "write", &cfg_pathWriteRes, "write the res file [word-id, class-id]");
59  opt.Add(wbOPT_INT, "thread", &cfg_nThread, "thread number");
60 
61  opt.Add(wbOPT_TRUE, "simple-cluster", &cfg_bSimpleCluster, "just using the count of unigram to perform cluster");
62  opt.Parse(_argc, _argv);
63 
64  if (cfg_nThread > 0)
65  omp_set_num_threads(cfg_nThread);
66  lout << "[OMP] omp_thread = " << omp_get_max_threads() << endl;
67 
68  WordCluster_t cluster(cfg_nClassNum, cfg_pathWriteRes);
69 // cluster.m_pathWordClass = cfg_pathWordClass;
70 // cluster.m_pathClassWord = cfg_pathClassWord;
71 // cluster.m_pathTagVocab = cfg_pathTagVocab;
72 
73  cluster.InitCount(cfg_pathTxt, cfg_pathReadRes);
74 
75 
76  if (cfg_bSimpleCluster) {
77  lout << "Simple Cluster..." << endl;
78  cluster.SimpleCluster();
79  }
80  else {
81  lout << "Cluster..." << endl;
82  cluster.Cluster(100);
83  }
84 
85  if (cfg_pathWriteRes)
86  cluster.WriteRes(cfg_pathWriteRes);
87 
88 // if (cfg_pathWordClass)
89 // cluster.WriteRes_WordClass(cfg_pathWordClass);
90 // if (cfg_pathClassWord)
91 // cluster.WriteRes_ClassWord(cfg_pathClassWord);
92 // if (cfg_pathTagVocab)
93 // cluster.WriteRes_TagVocab(cfg_pathTagVocab);
94 
95  // cluster.m_aClass[0] = 9;
96  // cluster.m_aClass[10] = 0;
97  // cluster.UpdataCount();
98  // cluster.WriteClass(cfg_pathClass);
99  // cluster.WriteCount(cluster.m_wordCount, wbFile("unigram.count", "wt"));
100  // cluster.WriteCount(cluster.m_wordGramCount, wbFile("bigram.count", "wt"));
101  // cluster.WriteCount(cluster.m_classCount, wbFile("classUnigram.count", "wt") );
102  // cluster.WriteCount(cluster.m_classGramCount, wbFile("classBigram.count", "wt") );
103  // cluster.WriteCount(cluster.m_classWordCount, wbFile("classWord.count", "wt"), true );
104  // cluster.WriteCount(cluster.m_wordClassCount, wbFile("wordClass.count", "wt") );
105 
106  // cluster.ExchangeWord(0, 9);
107  // cluster.ExchangeWord(10,0);
108  // cluster.WriteClass("class1.txt");
109  // cluster.WriteCount(cluster.m_wordCount, wbFile("unigram1.count", "wt"));
110  // cluster.WriteCount(cluster.m_wordGramCount, wbFile("bigram1.count", "wt"));
111  // cluster.WriteCount(cluster.m_classCount, wbFile("classUnigram1.count", "wt") );
112  // cluster.WriteCount(cluster.m_classGramCount, wbFile("classBigram1.count", "wt") );
113  // cluster.WriteCount(cluster.m_classWordCount, wbFile("classWord1.count", "wt"), true);
114  // cluster.WriteCount(cluster.m_wordClassCount, wbFile("wordClass1.count", "wt") );
115 
116 
117  return 1;
118 };
void SimpleCluster()
使用出现频率进行简单的分类,不需要迭代 (performs a simple classification based on occurrence frequency; no iteration is needed)
is true if exist
Definition: wb-option.h:33
Option opt
void Parse(const char *plabel, const char *pvalue)
parse a single option, "pvalue" can be NULL
Definition: wb-option.cpp:80
void InitCount(const char *path, const char *path_init_res=NULL)
void WriteRes(const char *path)
integer
Definition: wb-option.h:35
int cfg_nThread
void Add(ValueType t, const char *pLabel, void *pAddress, const char *pDocMsg=NULL)
Add an option.
Definition: wb-option.cpp:35
Log lout
the definition is in wb-log.cpp
Definition: wb-log.cpp:22
define all the code written by Bin Wang.
Definition: wb-file.cpp:21
Get the option from command line or command files.
Definition: wb-option.h:54
void Cluster(int nMaxTime=-1)
cluster