TRF Language Model
trf-corpus.cpp
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding languages may use different comment styles.
16 
17 
18 #include "trf-corpus.h"
19 
20 namespace trf
21 {
// NOTE(review): the signature line for this body (listing line 22) is missing
// from this extract; presumably this is the CorpusTxt destructor -- confirm
// against trf-corpus.h.
// Releases every sequence pointer held in m_aSeq with SAFE_DELETE and then
// calls m_aSeq.Clean() so the container reports zero elements.
23  {
24  // clean
25  for (int i = 0; i < m_aSeq.GetNum(); i++)
26  SAFE_DELETE(m_aSeq[i]);
27  m_aSeq.Clean();
28  }
29  void CorpusTxt::Reset(const char *pfilename)
30  {
31  // clean
32  for (int i = 0; i < m_aSeq.GetNum(); i++)
33  SAFE_DELETE(m_aSeq[i]);
34  m_aSeq.Clean();
35  m_nMinLen = 100;
36  m_nMaxLen = 0;
37 
38  // read
39  File file(pfilename, "rt");
40  char *pLine;
41  while (pLine = file.GetLine()) {
42 
43  Array<VocabID> *pSeq = new Array<VocabID>;
44  char *p = strtok(pLine, " \t\n");
45  while (p) {
46  pSeq->Add(atoi(p));
47  p = strtok(NULL, " \t\n");
48  }
49 
50  m_nMinLen = min(pSeq->GetNum(), m_nMinLen);
51  m_nMaxLen = max(pSeq->GetNum(), m_nMaxLen);
52  m_aSeq.Add(pSeq);
53  }
54 
55  m_nNum = m_aSeq.GetNum();
56  }
// NOTE(review): the signature line for this body (listing line 57) is missing
// from this extract. The cross-reference list at the end of the listing gives
// it as `virtual bool GetSeq(int nLine, Array<VocabID> &aSeq)`; presumably a
// CorpusTxt member -- confirm against trf-corpus.h.
// Copies the stored sequence number nLine into aSeq and returns true; returns
// false and leaves aSeq untouched when nLine >= GetNum().
// Only the upper bound is checked here; a negative nLine is not rejected.
58  {
59  if (nLine >= GetNum()) {
60  return false;
61  }
62 
63  aSeq.Copy(*m_aSeq[nLine]);
64  return true;
65  }
// NOTE(review): the signature line for this body (listing line 66) is missing
// from this extract. The cross-reference list at the end of the listing gives
// it as `virtual void GetLenCount(Array<int> &aLenCount)`; presumably a
// CorpusTxt member -- confirm against trf-corpus.h.
// Builds a histogram of sequence lengths: aLenCount is resized to
// m_nMaxLen + 1 entries, zero-filled, and aLenCount[len] is incremented once
// for every stored sequence of that length.
67  {
68  aLenCount.SetNum(m_nMaxLen + 1);
69  aLenCount.Fill(0);
70  for (int i = 0; i < GetNum(); i++) {
71  int nLen = m_aSeq[i]->GetNum();
72  aLenCount[nLen]++;
73  }
74  }
75 
76 
77 
78  /************************************************************************/
79  /* class CorpusRandSelect */
80  /************************************************************************/
// NOTE(review): the signature line for this body (listing line 81) is missing
// from this extract. The cross-reference list at the end of the listing gives
// it as `void Reset(CorpusBase *p)`; presumably a CorpusRandSelect member --
// confirm against trf-corpus.h.
// Stores the corpus pointer p in m_pCorpus and immediately builds a fresh
// random index order covering all m_pCorpus->GetNum() sequences.
82  {
83  m_pCorpus = p;
84  RandomIdx(m_pCorpus->GetNum());
85  }
// NOTE(review): the signature line for this body (listing line 86) is missing
// from this extract. The cross-reference list at the end of the listing gives
// it as `void RandomIdx(int nNum)`; presumably a CorpusRandSelect member --
// confirm against trf-corpus.h.
// Resizes m_aRandIdx to nNum entries, fills it with 0..nNum-1, passes it to
// RandomPos(), and rewinds the read cursor m_nCurIdx to 0.
87  {
88  m_aRandIdx.SetNum(nNum);
89  for (int i = 0; i < nNum; i++) {
90  m_aRandIdx[i] = i;
91  }
// RandomPos is defined in trf-def.cpp (not visible here); presumably it
// randomly permutes the nNum entries in place -- TODO confirm.
92  RandomPos(m_aRandIdx, nNum, nNum);
93 
94  m_nCurIdx = 0;
95  }
96  void CorpusRandSelect::GetIdx(int *pIdx, int nNum)
97  {
98  for (int i = 0; i < nNum; i++) {
99  if (m_nCurIdx >= m_pCorpus->GetNum()) {
100  RandomIdx(m_pCorpus->GetNum());
101  }
102 
103  pIdx[i] = m_aRandIdx[m_nCurIdx];
104  m_nCurIdx++;
105  }
106 
107  }
// NOTE(review): the signature line for this body (listing line 108) is missing
// from this extract. The cross-reference list at the end of the listing gives
// it as `void GetSeq(Array<VocabID> &aSeq)`; presumably a CorpusRandSelect
// member -- confirm against trf-corpus.h.
// Fetches the next sequence in random order into aSeq: regenerates the index
// order via RandomIdx() when the cursor has reached the corpus size, reads
// the sequence at m_aRandIdx[m_nCurIdx] from m_pCorpus, then advances the
// cursor. The bool returned by m_pCorpus->GetSeq() is ignored here.
109  {
110  if (m_nCurIdx >= m_pCorpus->GetNum()) {
111  RandomIdx(m_pCorpus->GetNum());
112  }
113 
114  m_pCorpus->GetSeq(m_aRandIdx[m_nCurIdx], aSeq);
115  m_nCurIdx++;
116  }
117 
118 }
void Copy(const Array< T > &array)
Copy the array to current array.
Definition: wb-vector.h:260
#define SAFE_DELETE(p)
memory release
Definition: wb-vector.h:49
void RandomPos(int *a, int len, int n)
Definition: trf-def.cpp:132
void GetSeq(Array< VocabID > &aSeq)
Get x.
Definition: trf-corpus.cpp:108
virtual void Reset(const char *pfilename)
Open file and Load the file.
Definition: trf-corpus.cpp:29
Array< Array< VocabID > * > m_aSeq
Definition: trf-corpus.h:63
void RandomIdx(int nNum)
Generate the random idx.
Definition: trf-corpus.cpp:86
file class.
Definition: wb-file.h:94
virtual void GetLenCount(Array< int > &aLenCount)
get the length count
Definition: trf-corpus.cpp:66
void SetNum(int n)
Set the array size, allocating enough memory.
Definition: wb-vector.h:238
virtual char * GetLine(bool bPrecent=false)
Read a line into the buffer.
Definition: wb-file.cpp:47
Array< VocabID > aSeq
Definition: main-TRF.cpp:153
void Clean()
Clean the array. Just set the top of the array to -1; do not release the memory.
Definition: wb-vector.h:258
int m_nMaxLen
record the maximum length;
Definition: trf-corpus.h:34
void Reset(CorpusBase *p)
Reset the class.
Definition: trf-corpus.cpp:81
int GetNum() const
Get Array number.
Definition: wb-vector.h:240
void Add(T t)
Add a value to the tail of array.
Definition: wb-vector.h:242
void GetIdx(int *pIdx, int nNum)
Get Random Index.
Definition: trf-corpus.cpp:96
int m_nNum
record the length number;
Definition: trf-corpus.h:35
virtual bool GetSeq(int nLine, Array< VocabID > &aSeq)
get the sequence in nLine
Definition: trf-corpus.cpp:57
void Fill(T m)
set all the values to m
Definition: wb-vector.h:139
int m_nMinLen
record the minimum length;
Definition: trf-corpus.h:33
Definition: trf-alg.cpp:20
virtual int GetNum() const
get the seq number
Definition: trf-corpus.h:47