TRF Language Model
trf-corpus.h
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding language may use different comment styles.
16 
17 
18 #pragma once
19 #include "trf-vocab.h"
20 #include "trf-model.h"
21 
22 namespace trf
23 {
28  class CorpusBase
29  {
30  protected:
32 
33  int m_nMinLen;
34  int m_nMaxLen;
35  int m_nNum;
36  public:
37  CorpusBase() : m_nNum(0), m_nMinLen(0), m_nMaxLen(0) {};
38 
40  virtual void Reset(const char *pfilename) = 0;
42  virtual bool GetSeq(int nLine, Array<VocabID> &aSeq) = 0;
44  virtual void GetLenCount(Array<int> &aLenCount) = 0;
45 
47  virtual int GetNum() const { return m_nNum; }
49  virtual int GetMinLen() const { return m_nMinLen; }
51  virtual int GetMaxLen() const { return m_nMaxLen; }
53  const char* GetFileName() const { return m_filename.GetBuffer(); }
54  };
55 
60  class CorpusTxt : public CorpusBase
61  {
62  protected:
64  public:
65  CorpusTxt() {};
66  CorpusTxt(const char *pfilename) { Reset(pfilename); }
67  ~CorpusTxt();
69  virtual void Reset(const char *pfilename);
71  virtual bool GetSeq(int nLine, Array<VocabID> &aSeq);
73  virtual void GetLenCount(Array<int> &aLenCount);
74  };
75 
81  {
82  protected:
84 
86  int m_nCurIdx;
87  public:
88  CorpusRandSelect() :m_pCorpus(NULL) {}
89  CorpusRandSelect(CorpusBase *pCorpus) { Reset(pCorpus); }
91  void Reset(CorpusBase *p);
93  void GetSeq(Array<VocabID> &aSeq);
95  void GetIdx(int *pIdx, int nNum);
97  void RandomIdx(int nNum);
98  };
99 
104  template <class Type>
105  class RandSeq
106  {
107  protected:
109  int m_nCur;
110  public:
111  RandSeq() :m_nCur(0) {}
112  RandSeq(Array<Type> &a) : m_nCur(0) { m_aSeq.Copy(a); }
114  void Random() {
115  int len = m_aSeq.GetNum();
116  for (int i = 0; i < len; i++) {
117  int s = rand() % (len - i);
118  //exchange i+s and i positions
119  int temp = m_aSeq[i + s];
120  m_aSeq[i + s] = m_aSeq[i];
121  m_aSeq[i] = temp;
122  }
123  m_nCur = 0; // reset the current position
124  }
126  void Add(Type t) { m_aSeq.Add(t); }
128  Type Get() {
129  if (m_nCur >= m_aSeq.GetNum()) {
130  Random();
131  }
132  return m_aSeq[m_nCur++];
133  }
134  };
135 }
void Add(Type t)
add a value
Definition: trf-corpus.h:126
CorpusTxt(const char *pfilename)
Definition: trf-corpus.h:66
void Copy(const Array< T > &array)
Copy the array to current array.
Definition: wb-vector.h:260
a dynamic string class
Definition: wb-string.h:53
CorpusBase * m_pCorpus
Definition: trf-corpus.h:83
virtual void Reset(const char *pfilename)=0
Open the file and load its contents.
const char * GetFileName() const
get the file name
Definition: trf-corpus.h:53
Array< int > m_aRandIdx
Definition: trf-corpus.h:85
Array< Array< VocabID > * > m_aSeq
Definition: trf-corpus.h:63
virtual bool GetSeq(int nLine, Array< VocabID > &aSeq)=0
get the sequence in nLine
void Random()
Randomly shuffle the sequence.
Definition: trf-corpus.h:114
Type Get()
Get a value.
Definition: trf-corpus.h:128
String m_filename
Definition: trf-corpus.h:31
Array< VocabID > aSeq
Definition: main-TRF.cpp:153
int m_nMaxLen
record the maximum length;
Definition: trf-corpus.h:34
int GetNum() const
Get Array number.
Definition: wb-vector.h:240
void Add(T t)
Add a value to the tail of array.
Definition: wb-vector.h:242
virtual int GetMinLen() const
get the min length
Definition: trf-corpus.h:49
RandSeq(Array< Type > &a)
Definition: trf-corpus.h:112
Array< Type > m_aSeq
Definition: trf-corpus.h:108
virtual void GetLenCount(Array< int > &aLenCount)=0
get the length count
int m_nNum
record the number of sequences;
Definition: trf-corpus.h:35
char * GetBuffer() const
get buffer
Definition: wb-string.h:74
int m_nMinLen
record the minimum length;
Definition: trf-corpus.h:33
Definition: trf-alg.cpp:20
virtual int GetMaxLen() const
get the max length
Definition: trf-corpus.h:51
virtual int GetNum() const
get the seq number
Definition: trf-corpus.h:47
CorpusRandSelect(CorpusBase *pCorpus)
Definition: trf-corpus.h:89