TRF Language Model
trf-vocab.cpp
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding language may use different comment styles.
16 
17 
18 #include "trf-vocab.h"
19 
20 namespace trf
21 {
23  {
24  m_aWords.Clean();
25  // m_aWords[VocabID_seqbeg] = Word_beg;
26  // m_aWords[VocabID_seqend] = Word_end;
27  }
28  Vocab::Vocab(const char* pathVocab)
29  {
30  int nNum = 0;
31  int nClassNum = 0;
32 
33  File file(pathVocab, "rt");
34  char *pLine;
35  while (pLine = file.GetLine())
36  {
37  VocabID id = -1;
38  char *pStr = NULL; // store the word string
39  char *pClass = NULL; // store the class infor
40 
41  char *p = strtok(pLine, " \t\n");
42  if (!p) {
43  lout_warning("[Vocab] Empty Line! (nLine=" << file.nLine << ")");
44  continue;
45  }
46 
47  if (strcmp(p, Word_beg) == 0) {
48  lout_error("[Vocab] the input vocab exists <s>! path=" << pathVocab);
49  }
50  else if (strcmp(p, Word_end) == 0) {
51  lout_error("[Vocab] the input vocab exists </s>! path=" << pathVocab);
52  }
53  else {
54  id = atoi(p);
55  pStr = strtok(NULL, " \t\n");
56  if (String(pStr, strlen("class=")) == "class=") {
57  pClass = pStr;
58  pStr = NULL;
59  }
60  else {
61  pClass = strtok(NULL, " \t\n");
62  }
63  }
64 
65  if (id != nNum) {
66  lout_error("[Vocab] The id is not continuous (id=" << id << ")(nNum=" << nNum << ")!");
67  }
68  m_aWords[id] = (pStr) ? pStr : "NAN";
69  m_aWordID[id] = id;
70 
71 
72  // get the class
73  if (pClass) {
74  pClass += strlen("class=");
75  /* read the class information */
76  m_aClass[id] = atoi(pClass);
77  /* count the class number */
78  nClassNum = max(nClassNum, m_aClass[id] + 1);
79  }
80 
81  nNum++;
82  }
83 
84  // get the class to words
85  m_aClass2Word.SetNum(nClassNum);
86  m_aClass2Word.Fill(NULL);
87  for (int wid = 0; wid < m_aClass.GetNum(); wid++) {
88  VocabID cid = m_aClass[wid];
89  if (!m_aClass2Word[cid]) {
90  m_aClass2Word[cid] = new Array<int>;
91  }
92  m_aClass2Word[cid]->Add(wid);
93  }
94  for (int cid = 0; cid < m_aClass2Word.GetNum(); cid++) {
95  if (m_aClass2Word[cid] == NULL) {
96  lout_error("[Vocab] class " << cid << " is empty!");
97  }
98  }
99 
100 
101  lout << "[Vocab] Read from " << pathVocab << endl;
102  lout << "[Vocab] Read " << nNum << " words" << endl;
103  lout << "[Vocab] Class = " << m_aClass2Word.GetNum() << endl;
104 // for (int cid = 0; cid < m_aClass2Word.GetNum(); cid++) {
105 // lout << "[Vocab] cid=" << cid << "\t";
106 // lout.output(m_aClass2Word[cid]->GetBuffer(), m_aClass2Word[cid]->GetNum()) << endl;
107 // }
108  }
109 
111  {
112  for (int i = 0; i < m_aClass2Word.GetNum(); i++) {
114  }
116  }
117 
118  void Vocab::GetClass(VocabID *pcid, const VocabID *pwid, int nlen)
119  {
120  for (int i = 0; i < nlen; i++) {
121  pcid[i] = GetClass(pwid[i]);
122  }
123  }
124 }
#define SAFE_DELETE(p)
memory release
Definition: wb-vector.h:49
a dynamic string class
Definition: wb-string.h:53
int VocabID
Definition: trf-vocab.h:23
#define lout_error(x)
Definition: wb-log.h:183
Array< VocabID > m_aClass
store the classes of each word. Support soft and hard class
Definition: trf-vocab.h:39
Array< String > m_aWords
the string of each vocabulary id
Definition: trf-vocab.h:38
VocabID GetClass(VocabID wid)
get class
Definition: trf-vocab.h:62
file class.
Definition: wb-file.h:94
void SetNum(int n)
Set the array size, allocating (malloc) enough memory.
Definition: wb-vector.h:238
virtual char * GetLine(bool bPrecent=false)
Read a line into the buffer.
Definition: wb-file.cpp:47
#define lout_warning(x)
Definition: wb-log.h:184
void Clean()
Clean the array. Just set the top of the array to -1 and do not release the memory.
Definition: wb-vector.h:258
int GetNum() const
Get Array number.
Definition: wb-vector.h:240
void Add(T t)
Add a value to the tail of array.
Definition: wb-vector.h:242
Log lout
the definition is in wb-log.cpp
Definition: wb-log.cpp:22
void Fill(T m)
set all the values to m
Definition: wb-vector.h:139
int nLine
the number of lines read from the file
Definition: wb-file.h:98
Definition: trf-alg.cpp:20
Array< Array< VocabID > * > m_aClass2Word
store the word belonging to each class.
Definition: trf-vocab.h:40
Array< VocabID > m_aWordID
the word id. i.e 0,1,2,3,...
Definition: trf-vocab.h:37