TRF Language Model
trf-feature.h
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding language may use different comment styles.
16 
17 
18 #pragma once
19 #include "trf-vocab.h"
20 #include <omp.h>
21 
22 namespace trf
23 {
 // NOTE(review): presumably an upper bound on the feature / n-gram order;
 // it is not referenced anywhere else in this header listing - confirm against callers.
24  const int c_nMaxOrder = 100;
25 
 // Forward declarations of the classes defined below in this header.
26  class Seq;
27  class FeatStyle;
28  class FeatTable;
29  class Feat;
30 
 // Row indices into the two-row matrix held by Seq:
 // Seq::GetWordSeq() reads row word_layer, Seq::GetClassSeq() reads row class_layer.
31 #define word_layer 0
32 #define class_layer 1
33 
/// A sequence made of two parallel rows of VocabID values: a word row and a class row.
/// Both rows live in the matrix member `x` (row word_layer and row class_layer).
/// NOTE: the declaration of `x` is on a line that is not part of this listing.
41  class Seq
42  {
43  public:
 /// Current sequence length; this is the value returned by GetLen().
45  int nLen;
 /// Number of columns currently allocated in `x` (the buffer size).
46  int nMaxLen;
47  public:
 /// Create an empty sequence: zero length and zero recorded buffer size.
48  Seq() :nLen(0),nMaxLen(0) {}
 /// Create a sequence and immediately size it to `len` via Reset().
49  Seq(int len) :nLen(0),nMaxLen(0) { Reset(len); }
 /// Set the sequence length to `p_len`.
 /// The buffer is only reallocated when `p_len` exceeds the current buffer size;
 /// a smaller or equal `p_len` just updates nLen and leaves the buffer untouched.
51  void Reset(int p_len) {
52  if (p_len > nMaxLen) {
 // Grow: build a zero-filled 2 x p_len matrix ...
53  Mat<VocabID> newx(2, p_len);
54  newx.Fill(0);
 // ... carry over every column of the old buffer (all nMaxLen of them, not just nLen) ...
55  for (int i = 0; i < nMaxLen; i++) {
56  newx[0][i] = x[0][i];
57  newx[1][i] = x[1][i];
58  }
 // ... then replace the stored matrix and record the new buffer size.
59  x.Copy(newx);
60  nMaxLen = p_len;
61  }
62 
63  nLen = p_len;
64  }
 /// Copy the matrix, the length and the buffer size from `seq` into this object.
66  void Copy(Seq &seq) {
67  x.Copy(seq.x);
68  nLen = seq.nLen;
69  nMaxLen = seq.nMaxLen;
70  }
 /// Return the current sequence length (nLen).
71  int GetLen() const { return nLen; }
 /// Fill this Seq from a word sequence (defined in trf-feature.cpp).
 /// NOTE(review): `pv` is presumably the vocabulary used to derive the class row - confirm in the .cpp.
73  void Set(Array<int> &aInt, Vocab *pv);
 /// Overload of Set taking a raw int pointer plus a length (defined outside this header).
74  void Set(int *pInt, int nLen, Vocab *pv);
 /// Randomise the sequence (defined in trf-feature.cpp; details not visible here).
76  void Random(Vocab *pv);
 /// Set the class row based on the word row (defined in trf-feature.cpp).
78  void SetClass(Vocab *pv);
 /// Return a pointer to the buffer of the word row, x[word_layer].
80  VocabID *GetWordSeq() { return x[word_layer].GetBuf(); }
 /// Return a pointer to the buffer of the class row, x[class_layer].
82  VocabID* GetClassSeq() { return x[class_layer].GetBuf(); }
 /// Print the sequence (defined in trf-feature.cpp; output destination not visible here).
83  void Print();
 /// Print the sequence to the given File (defined outside this header).
84  void Print(File &file);
85  };
86 
/// Analyses one determinate feature style string (one without ":").
/// NOTE: the member holding the parsed fields is declared on a line not part of this listing.
91  class FeatStyle
92  {
93  public:
 /// One field of a feature style.
94  typedef struct {
 /// The character "w" or "c".
95  char c;
 /// 0 for "w", 1 for "c"; used to index the value in Seq.
96  short i;
 /// The number attached to the field, e.g. "w[2]" gives n = 2.
97  short n;
98  } Field;
99  public:
 /// Total order of this style, including the skip distance.
100  int m_nOrder;
102  public:
 /// Create an empty style with order 0.
103  FeatStyle() :m_nOrder(0) {}
 /// Create a style by parsing `pstyle` with Set().
 /// NOTE(review): m_nOrder is not initialised here; this relies on Set() assigning it - confirm in the .cpp.
104  FeatStyle(const char *pstyle) { Set(pstyle); }
 /// Parse the style string `pstyle` (defined outside this header).
106  void Set(const char *pstyle);
 /// Declared here, defined outside this header.
 /// NOTE(review): presumably builds `key` from `seq` at position nPos for the given order and
 /// returns whether a key could be formed - confirm against the implementation.
108  bool GetKey(Seq &seq, int nPos, int nOrder, Array<int> &key);
109  };
/// A table of features for one feature style, such as "w3" (word 3-gram) or "c2" (class 2-gram).
/// NOTE: several data members used by the inline getters below (m_style, m_nMinOrder, m_nMaxOrder)
/// are declared on lines that are not part of this listing.
124  class FeatTable
125  {
126  public:
 /// The n-gram (feature) number; returned by GetNum().
130  int m_nNum;
133 
135 
137 
138  public:
 /// Construct a table from a style string (defaults to the empty string); defined outside this header.
140  FeatTable(const char *pstyle = "");
 /// Destructor; defined outside this header.
142  ~FeatTable();
 /// Re-initialise the table from a style string (defined outside this header).
144  void Reset(const char *pstyle);
 /// Return m_nNum.
146  int GetNum() const { return m_nNum; }
 /// Return m_nMinOrder (the minimum order).
148  int GetMinOrder() const { return m_nMinOrder; }
 /// Return m_nMaxOrder (the maximum order).
150  int GetMaxOrder() const { return m_nMaxOrder; }
 /// Return the style string as a C string (the buffer of m_style).
152  const char *GetStyle() const { return m_style.GetBuffer(); }
 // The methods below are only declared here; the notes are hedged readings of their names/signatures.
 /// NOTE(review): presumably extracts the features occurring in `seq` into the table - confirm.
154  void LoadFeat(Seq &seq);
 /// NOTE(review): presumably assigns feature indices starting at `begIdx` - confirm.
156  void IndexFeat(int begIdx);
 /// NOTE(review): presumably applies the cutoff settings; meaning of the returned int is not visible here.
158  int CutoffFeat();
 /// Look up features for an explicit key; results go to `afeat` (semantics of bBeg/bEnd not visible here).
160  void Find(Array<int> &afeat, Array<int> &key, bool bBeg, bool bEnd);
 /// Look up features in `seq` for a given position and order; results go to `afeat`.
162  void Find(Array<int> &afeat, Seq &seq, int pos, int order);
 /// Look up features over the whole `seq`; results go to `afeat`.
164  void Find(Array<int> &afeat, Seq &seq);
 /// NOTE(review): presumably finds the features that depend on position `pos` - confirm.
166  void FindPosDep(Array<int> &afeat, Seq &seq, int pos);
167 
 /// Read the table from `file`; `pValue` is optional (defaults to NULL).
169  void ReadT(File &file, PValue *pValue = NULL);
 /// Write the table to `file`; `pValue` is optional (defaults to NULL).
171  void WriteT(File &file, PValue *pValue = NULL);
 // Binary read/write are disabled (commented out) below.
172 // /// Read from binary
173 // void ReadB(File &file);
174 // /// Write to binary
175 // void WriteB(File &file);
176  };
177 
/// Container that includes all the feature tables.
/// NOTE: the data members used below (m_aTable, m_nTotalNum) are declared on lines
/// that are not part of this listing.
179  class Feat
180  {
181  public:
184  public:
 /// Construct with a zero feature count; when nOrder > 0 the tables are set up via Reset(nOrder, bClass).
 /// With the default nOrder = 0 no tables are created.
185  Feat(int nOrder = 0, bool bClass = true)
186  {
187  m_nTotalNum = 0;
188  if (nOrder > 0) {
189  Reset(nOrder, bClass);
190  }
191  }
 /// Release the tables held in m_aTable through the SAFE_DEL_POINTER_ARRAY macro.
192  ~Feat() { SAFE_DEL_POINTER_ARRAY(m_aTable); }
 // The methods below are only declared here; hedged notes are readings of names/signatures.
 /// Re-initialise from an order and a flag (NOTE(review): bClass presumably enables class features - confirm).
194  void Reset(int nOrder, bool bClass);
 /// Re-initialise from a feature-type description string (format not visible here).
196  void Reset(const char *pfeatType);
 /// Return the maximum order (defined outside this header).
198  int GetMaxOrder();
 /// Return the total feature number, m_nTotalNum.
200  int GetNum() const { return m_nTotalNum; }
 /// Look up features in `seq` for a given position and order; results go to `afeat`.
202  void Find(Array<int> &afeat, Seq &seq, int pos, int order);
 /// Look up features over the whole `seq`; results go to `afeat`.
204  void Find(Array<int> &afeat, Seq &seq);
 /// NOTE(review): presumably restricts the lookup to class features - confirm.
206  void FindClass(Array<int> &afeat, Seq &seq, int pos, int order);
 /// NOTE(review): presumably restricts the lookup to word features - confirm.
208  void FindWord(Array<int> &afeat, Seq &seq, int pos, int order);
 /// NOTE(review): presumably finds features depending on position `pos`; meaning of `type` is not visible here.
210  void FindPosDep(Array<int> &afeat, Seq &seq, int pos, int type = 0);
 /// NOTE(review): presumably loads features from the corpus file at `path` using vocabulary `pv` - confirm.
212  void LoadFeatFromCorpus(const char *path, Vocab *pv);
 /// Write to `file`; `pValue` is optional (defaults to NULL).
214  void WriteT(File &file, PValue *pValue = NULL);
 /// Read from `file`; `pValue` is optional (defaults to NULL).
216  void ReadT(File &file, PValue *pValue = NULL);
217  };
219 }
char c
the character "w" or "c"
Definition: trf-feature.h:95
void Copy(Seq &seq)
copy the sequence
Definition: trf-feature.h:66
String m_style
using a string array to store the feature styles in this table, such as w2, w3,...
Definition: trf-feature.h:127
a dynamic string class
Definition: wb-string.h:53
m WriteT(cfg_pathModelWrite)
int GetNum() const
get number
Definition: trf-feature.h:146
void SetClass(Vocab *pv)
set the class based on the word sequence
Definition: trf-feature.cpp:35
void Random(Vocab *pv)
Random.
Definition: trf-feature.cpp:39
int VocabID
Definition: trf-vocab.h:23
void Fill(T v)
Definition: wb-mat.h:397
Analyse a determinate feat style (without ":")
Definition: trf-feature.h:91
Trie< VocabID, int > * m_ptrie
index all the features.
Definition: trf-feature.h:132
short n
number, "w[2]" then n=2
Definition: trf-feature.h:97
double PValue
Definition: trf-def.h:26
define the feature style. such as "w3"(word-3gram); "c2"(class-2gram);
Definition: trf-feature.h:124
int nLen
Definition: trf-feature.h:45
int m_nMaxOrder
the ngram maximum order, including the skip distance
Definition: trf-feature.h:129
void Reset(int p_len)
reset sets the len variable; the buffer is enlarged (keeping its existing contents) only when the requested length exceeds the current buffer size.
Definition: trf-feature.h:51
int m_nOrder
the total order of this style, including the skip distance
Definition: trf-feature.h:100
Seq(int len)
Definition: trf-feature.h:49
int GetLen() const
Definition: trf-feature.h:71
Array< int > m_aCutoff
cutoff setting for different order
Definition: trf-feature.h:131
define a sequence including the word sequence and class sequence
Definition: trf-feature.h:41
void Print()
Definition: trf-feature.cpp:52
short i
if "w" then i=0, if "c" then i=1, used to index the value in Seq
Definition: trf-feature.h:96
int m_nTotalNum
total feature number
Definition: trf-feature.h:183
int GetNum() const
Get number.
Definition: trf-feature.h:200
Array< FeatTable * > m_aTable
different feature table
Definition: trf-feature.h:182
int m_nNum
the ngram number
Definition: trf-feature.h:130
FeatStyle(const char *pstyle)
Definition: trf-feature.h:104
int GetMinOrder() const
get minimum order
Definition: trf-feature.h:148
file class.
Definition: wb-file.h:94
void Set(Array< int > &aInt, Vocab *pv)
transform the word sequence (from file) to Seq
Definition: trf-feature.cpp:22
#define word_layer
Definition: trf-feature.h:31
Mat< VocabID > x
Definition: trf-feature.h:44
const int c_nMaxOrder
Definition: trf-feature.h:24
const char * GetStyle() const
get style string
Definition: trf-feature.h:152
VocabID * GetClassSeq()
get class sequence
Definition: trf-feature.h:82
VocabID * GetWordSeq()
get word sequence
Definition: trf-feature.h:80
char * GetBuffer() const
get buffer
Definition: wb-string.h:74
int nMaxLen
to denote the buffer size
Definition: trf-feature.h:46
void Copy(MatShell< T > &m)
Definition: wb-mat.h:475
Array< Array< int > * > m_aKey
define the key for each thread
Definition: trf-feature.h:136
include all the feature table
Definition: trf-feature.h:179
int m_nMinOrder
the ngram minimum order, including the skip distance
Definition: trf-feature.h:128
Array< Field > m_aFields
each field
Definition: trf-feature.h:101
int GetMaxOrder() const
get maximum order
Definition: trf-feature.h:150
Definition: trf-alg.cpp:20
T * GetBuf() const
Definition: wb-mat.h:124
#define SAFE_DEL_POINTER_ARRAY(a)
Definition: wb-vector.h:52
#define class_layer
Definition: trf-feature.h:32
Array< FeatStyle * > m_aStyleInfo
Definition: trf-feature.h:134
Feat(int nOrder=0, bool bClass=true)
Definition: trf-feature.h:185