TRF Language Model
wb-lhash.h
Go to the documentation of this file.
1 // You may obtain a copy of the License at
2 //
3 // http://www.apache.org/licenses/LICENSE-2.0
4 //
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 //
11 // Copyright 2014-2015 Tsinghua University
12 // Author: wb.th08@gmail.com (Bin Wang), ozj@tsinghua.edu.cn (Zhijian Ou)
13 //
14 // All h, cpp, cc, and script files (e.g. bat, sh, pl, py) should include the above
15 // license declaration. Different coding language may use different comment styles.
16 
17 
25 #ifndef _WB_LHASH_H_
26 #define _WB_LHASH_H_
27 #include <iostream>
28 #include <cstring>
29 #include <cmath>
30 using namespace std;
31 
32 
33 #ifndef Bit2Size
34 #define Bit2Size(bits) ( (int)(1<<(bits)) )
35 #endif
36 
37 
38 namespace wb
39 {
40 
41  template <class KeyT, class DataT> class LHash;
42  template <class KeyT, class DataT> class LHashIter;
43 
44 
45 
47  template <class KeyT>
48  inline KeyT Map_copyKey(KeyT key) { return key; }
50  inline const char *Map_copyKey(const char *key) { return strdup(key); }
51 
52 
54  template <class KeyT>
55  inline void Map_freeKey(KeyT key) {};
57  inline void Map_freeKey(const char *key) { free((void*)key); }
58 
59 
66  const short c_ShortNokeyValue = (short)(1u << (sizeof(short) * 8 - 1));
67  const int c_IntNokeyValue = (int)(1u << (sizeof(int) * 8 - 1));
68  const long c_LongNokeyValue = (long)(1uL << (sizeof(long) * 8 - 1));
69  const long long c_LLongNokeyValue = (long long)(1LL << (sizeof(long long) * 8 - 1));
70  const short unsigned c_UShortNokeyValue = ~(short unsigned)0;
71  const unsigned c_UIntNokeyValue = ~(unsigned)0;
72  const long unsigned c_ULongNokeyValue = ~(long unsigned)0;
73  const float c_floatNokeyValue = 1e15;
74  const double c_doubleNokeyValue = 1e20;
75 
76 
79  template <class KeyT>
81  inline void Map_noKey(KeyT *&key) { key = 0; }
82  inline void Map_noKey(int &key) { key = c_IntNokeyValue; }
83  inline void Map_noKey(short int &key) { key = c_ShortNokeyValue; }
84  inline void Map_noKey(long int &key) { key = c_LongNokeyValue; }
85  inline void Map_noKey(long long &key) { key = c_LLongNokeyValue; }
86  inline void Map_noKey(unsigned &key) { key = c_UIntNokeyValue; }
87  inline void Map_noKey(short unsigned &key) { key = c_UShortNokeyValue; }
88  inline void Map_noKey(long unsigned &key) { key = c_ULongNokeyValue; }
89  inline void Map_noKey(float &key) { key = c_floatNokeyValue; }
90  inline void Map_noKey(double &key) { key = c_doubleNokeyValue; }
92 
96  template <class KeyT>
98  inline bool Map_noKeyP(KeyT *key) { return key == 0; }
99  inline bool Map_noKeyP(int key) { return key == c_IntNokeyValue; }
100  inline bool Map_noKeyP(short int key) { return key == c_ShortNokeyValue; }
101  inline bool Map_noKeyP(long int key) { return key == c_LongNokeyValue; }
102  inline bool Map_noKeyP(long long key) { return key == c_LLongNokeyValue; }
103  inline bool Map_noKeyP(unsigned key) { return key == c_UIntNokeyValue; }
104  inline bool Map_noKeyP(short unsigned key) { return key == c_UShortNokeyValue; }
105  inline bool Map_noKeyP(long unsigned key) { return key == c_ULongNokeyValue; }
106  inline bool Map_noKeyP(float key) { return key == c_floatNokeyValue; }
107  inline bool Map_noKeyP(double &key) { return key == c_doubleNokeyValue; }
109 
113  template <class KeyT>
115  inline bool Map_equalKey(KeyT key1, KeyT key2) { return (key1 == key2); }
116  inline bool Map_equalKey(float key1, float key2) { return fabs(key1 - key2) < 1e-2; }
117  inline bool Map_equalKey(double key1, double key2) { return fabs(key1 - key2) < 1e-15; }
118  inline bool Map_equalKey(const char *pStr1, const char *pStr2) { return strcmp(pStr1, pStr2) == 0; }
120 
121 #define HashMask(nbits) (~((~0L)<<(nbits)))
122 
128  inline unsigned long LHash_hashKey(unsigned long key, unsigned maxBits)
131  {
132  return (((key * 1103515245 + 12345) >> (30 - maxBits)) & HashMask(maxBits));
133  }
134 
136  template <class KeyT>
137  inline unsigned long LHash_hashKey(KeyT *key, unsigned maxBits)
138  {
139  return LHash_hashKey((unsigned long)key, maxBits);
140  }
141 
143  inline unsigned long LHash_hashKey(int key, unsigned maxBits)
144  {
145  return LHash_hashKey((unsigned long)key, maxBits);
146  }
147 
149  inline unsigned long LHash_hashKey(float key, unsigned maxBits)
150  {
151  return LHash_hashKey((unsigned long)(100 * key), maxBits);
152  }
154  inline unsigned long LHash_hashKey(double key, unsigned maxBits)
155  {
156  return LHash_hashKey((unsigned long)(100 * key), maxBits);
157  }
159  /*
160  * The string hash function was borrowed from the Tcl libary, by
161  * John Ousterhout. This is what he wrote:
162  *
163  * I tried a zillion different hash functions and asked many other
164  * people for advice. Many people had their own favorite functions,
165  * all different, but no-one had much idea why they were good ones.
166  * I chose the one below (multiply by 9 and add new character)
167  * because of the following reasons:
168  *
169  * 1. Multiplying by 10 is perfect for keys that are decimal strings,
170  * and multiplying by 9 is just about as good.
171  * 2. Times-9 is (shift-left-3) plus (old). This means that each
172  * character's bits hang around in the low-order bits of the
173  * hash value for ever, plus they spread fairly rapidly up to
174  * the high-order bits to fill out the hash value. This seems
175  * works well both for decimal and non-decimal strings.
176  */
177  inline unsigned long LHash_hashKey(const char *key, unsigned maxBits)
178  {
179  unsigned long i = 0;
180 
181  for (; *key; key++) {
182  i += (i << 3) + *key;
183  }
184  return LHash_hashKey(i, maxBits);
185  }
187 
191  template <class KeyT>
193  inline bool LHash_IncSort(KeyT k1, KeyT k2) { return k1 <= k2; }
194  inline bool LHash_IncSort(const char *p1, const char *p2) { return strcmp(p2, p1) <= 0; }
196 
197  const int cn_MinLHashBits = 3;
198  const int cn_MaxLHashBits = 31;
199  const float cf_HashRatio = 0.8f;
200 
201 
208  template <class KeyT, class DataT>
209  class LHash
210  {
211  friend class LHashIter<KeyT, DataT>;
212  public:
214  typedef struct
215  {
216  KeyT key;
217  DataT value;
218  } Unit;
219  protected:
222  Unit *m_pUnit;
223 
224 #ifdef _DEBUG
225  static int nLocateNum;
226  static double fLocateStep;
227 #endif
228 
229  public:
231  LHash(int nSize = 0) :m_nMaxBits(0), m_pUnit(NULL), m_nUnitNum(0) { Reset(nSize); }
233  ~LHash() { Release(); }
235  int GetNum() const { return m_nUnitNum; }
237  int GetSize() const { return Bit2Size(m_nMaxBits); }
239  int GetMaxBits() const { return m_nMaxBits; }
241  const Unit* GetBuffer() const { return m_pUnit; }
243  size_t TotalMemCost()
244  {
245  return sizeof(*this) + GetSize() * sizeof(Unit);
246  }
248  int RoundSize(int nSize)
249  {
250  if (nSize < Bit2Size(cn_MinLHashBits))
251  return nSize;
252  return (int)((nSize + 1) / cf_HashRatio);
253  }
255  void Alloc(int nSize)
256  {
257  int maxBits, maxUnits;
258 
259  //���㣬��Ҫ����nSize����Ҫ���ڴ�ռ��bits��
260  maxBits = 1; //maxBits��СΪ1
261  while (Bit2Size(maxBits) < nSize) {
262  if (maxBits > cn_MaxLHashBits) {
263  cout << "[LHash] Alloc: needed bits over cn_MaxLashBits" << endl;
264  exit(1);
265  }
266  ++maxBits;
267  }
268 
269  maxUnits = Bit2Size(maxBits);
270  if (m_pUnit) {
271  delete[] m_pUnit;
272  m_pUnit = NULL;
273  }
274  m_pUnit = new Unit[maxUnits];
275 
276  m_nMaxBits = maxBits;
277  m_nUnitNum = 0;
278 
279  //�����е�ֵ������ΪnoKey
280  for (int i = 0; i<maxUnits; i++)
281  Map_noKey(m_pUnit[i].key);
282  }
284  void Release()
285  {
286  if (m_pUnit)
287  {
288  int maxUnits = Bit2Size(m_nMaxBits);
289  for (int i = 0; i<maxUnits; i++){
290  if (!Map_noKeyP(m_pUnit[i].key)) {
291  Map_freeKey(m_pUnit[i].key);
292  }
293  }
294 
295  delete [] m_pUnit;
296  m_pUnit = NULL;
297  }
298  m_nUnitNum = 0;
299  m_nMaxBits = 0;
300  }
302  void Reset(int nSize)
303  {
304  Release();
305  if (nSize > 0)
306  Alloc(RoundSize(nSize));
307  }
309  void Clean()
310  {
311  if (m_pUnit) {
312  int nMaxUnit = Bit2Size(m_nMaxBits);
313  for (int i = 0; i<nMaxUnit; i++) {
314  if (!Map_noKeyP(m_pUnit[i].key))
315  Map_noKey(m_pUnit[i].key);
316  }
317 
318  m_nUnitNum = 0;
319  }
320  }
322  void Fill(DataT d)
323  {
324  if (m_pUnit) {
325  int nMaxUnit = Bit2Size(m_nMaxBits);
326  for (int i = 0; i<nMaxUnit; i++) {
327  if (!Map_noKeyP(m_pUnit[i].key))
328  m_pUnit[i].value = d;
329  }
330  }
331  }
333  inline bool Locate(KeyT key, int &index) const
334  {
335  //lhash_assert(!Map_noKeyP(key));
336 
337  if (!m_pUnit)
338  return false;
339 
340 
341  if (m_nMaxBits < cn_MinLHashBits)
342  {
343  //���Բ���
344  for (int i = 0; i<m_nUnitNum; i++) {
345  if (Map_equalKey(key, m_pUnit[i].key)) {
346  index = i;
347  return true;
348  }
349  }
350 
351  index = m_nUnitNum;
352  return false;
353  }
354  else
355  {
356 #if _DEBUG
357  int nStep = 1;
358 #endif
359  //��ϣ����
360  bool bFound = false;
361  int nHash = LHash_hashKey(key, m_nMaxBits);
362  for (int i = nHash;; i = (i + 1) % Bit2Size(m_nMaxBits)) {
363  if (Map_noKeyP(m_pUnit[i].key)) {
364  index = i;
365  bFound = false;
366  break;
367  }
368  else if (Map_equalKey(key, m_pUnit[i].key)) {
369  index = i;
370  bFound = true;
371  break;
372  }
373 #if _DEBUG
374  nStep++;
375 #endif
376  }
377 
378 #if _DEBUG
379  //��¼���д���
382  1.0 * nStep / (LHash<KeyT, DataT>::nLocateNum + 1);
384 #endif
385 
386  return bFound;
387  }
388 
389  return false;
390  }
392  DataT *Find(KeyT key, bool &bFound)
393  {
394  int index;
395  if ((bFound = Locate(key, index))) {
396  return &(m_pUnit[index].value);
397  }
398  else {
399  return NULL;
400  }
401  }
403  DataT *Find(KeyT key) {
404  bool bFound;
405  return Find(key, bFound);
406  }
408  DataT *Insert(KeyT key, bool &bFound)
409  {
410  int index;
411 
412  if (!m_pUnit)
413  Alloc(1);
414 
415  if ((bFound = Locate(key, index))) {
416  return &(m_pUnit[index].value);
417  }
418  else {
419  int nNewSize = RoundSize(m_nUnitNum + 1);
420 
421  if (nNewSize > Bit2Size(m_nMaxBits)) { //��Ҫ���¿����ڴ�
422  Unit *pOldUnit = m_pUnit;
423  int nOldUnitNum = m_nUnitNum;
424  int nOldMaxUnit = Bit2Size(m_nMaxBits);
425 
426  m_pUnit = NULL; //��Ҫ���ָ�룬�������Alloc���ͷŵ��ڴ�
427  Alloc(nNewSize); //�����ڴ�
428  m_nUnitNum = nOldUnitNum;
429 
430  if (m_nMaxBits < cn_MinLHashBits) {
431  //ֻ������copy��������
432  memcpy(m_pUnit, pOldUnit, sizeof(pOldUnit[0])*nOldUnitNum);
433  }
434  else {
435  // rehash
436  for (int i = 0; i<nOldMaxUnit; i++) {
437  KeyT curKey = pOldUnit[i].key;
438  if (!Map_noKeyP(curKey)) {
439  Locate(curKey, index);
440  memcpy(&(m_pUnit[index]), &(pOldUnit[i]), sizeof(pOldUnit[i]));
441  }
442  }
443  }
444 
445  delete[] pOldUnit;
446  pOldUnit = NULL;
447 
448  //����Locate
449  Locate(key, index);
450  }
451 
452  m_pUnit[index].key = Map_copyKey(key);
453  memset(&(m_pUnit[index].value), 0, sizeof(m_pUnit[index].value));
454  new (&(m_pUnit[index].value)) DataT; //���ù��캯��
455 
456  m_nUnitNum++;
457  return &(m_pUnit[index].value);
458  }
459  }
461  DataT *Insert(KeyT key)
462  {
463  bool bFound;
464  return Insert(key, bFound);
465  }
468  {
469  if (&other == this)
470  return;
471 
472  if (other.m_pUnit) {
473  int maxSize = Bit2Size(other.m_nMaxBits);
474  Alloc(maxSize);
475  for (int i = 0; i<maxSize; i++) {
476  KeyT thisKey = other.m_pUnit[i].key;
477 
478  if (!Map_noKeyP(thisKey)) {
479  m_pUnit[i].key = Map_copyKey(thisKey);
480 
481  new (&(m_pUnit[i].value)) DataT;
482 
483  m_pUnit[i].value = other.m_pUnit[i].value;
484  }
485  }
486  m_nUnitNum = other.m_nUnitNum;
487  }
488  else {
489  // if (m_pUnit)
490  // cout<<"!!!!!"<<endl;
491  Clean();
492  }
493  }
494  };
495 
496 
497 #ifdef _DEBUG
498  template <class KeyT, class DataT>
500  template <class KeyT, class DataT>
502 #endif
503 
504 
508  template <class KeyT, class DataT>
509  class LHashIter
510  {
511  private:
512  LHash<KeyT, DataT> *m_phash;
513  int m_nCurIndex;
514 
515  bool(*m_sortFun)(KeyT, KeyT);
516  KeyT *m_pSortedKey;
517  public:
519  LHashIter(LHash<KeyT, DataT> *phash, bool(*sort)(KeyT, KeyT) = 0) : m_phash(phash),m_sortFun(sort)
520  {
521  m_nCurIndex = 0;
522  m_pSortedKey = NULL;
523  Init();
524  }
527  {
528  if (m_pSortedKey)
529  delete[] m_pSortedKey;
530  m_pSortedKey = NULL;
531  }
533  void Init()
534  {
535  m_nCurIndex = 0;
536  if (!m_phash)
537  return;
538 
539  if (m_sortFun) {
540  //��Key����
541  if (m_pSortedKey)
542  delete[] m_pSortedKey;
543  m_pSortedKey = NULL;
544 
545  if (m_phash->GetNum() > 0) {
546  m_pSortedKey = new KeyT[m_phash->GetNum()];
547  int curKey = 0;
548 
549  int maxSize = Bit2Size(m_phash->m_nMaxBits);
550  for (int i = 0; i<maxSize; i++) {
551  if (!Map_noKeyP(m_phash->m_pUnit[i].key)) {
552  m_pSortedKey[curKey] = m_phash->m_pUnit[i].key;
553  curKey++;
554  }
555  }
556 
557  //����
558  for (int i = 0; i<curKey - 1; i++) {
559  for (int j = i + 1; j<curKey; j++) {
560  if (!m_sortFun(m_pSortedKey[i], m_pSortedKey[j])) {
561  KeyT t = m_pSortedKey[i];
562  m_pSortedKey[i] = m_pSortedKey[j];
563  m_pSortedKey[j] = t;
564  }
565  }
566  }
567  }
568 
569  }
570  }
572 
576  DataT* Next(KeyT &key)
577  {
578  if (!m_phash || m_phash->m_pUnit == NULL || m_phash->m_nUnitNum == 0)
579  return NULL;
580 
581  if (m_sortFun)
582  {
583  //����
584  if (m_nCurIndex == m_phash->GetNum())
585  return NULL;
586 
587  key = m_pSortedKey[m_nCurIndex++];
588  return m_phash->Find(key);
589  }
590  else
591  {
592  for (; m_nCurIndex < m_phash->GetSize(); m_nCurIndex++) {
593  key = m_phash->m_pUnit[m_nCurIndex].key;
594  if (!Map_noKeyP(key)) {
595  return &(m_phash->m_pUnit[m_nCurIndex++].value);
596  }
597  }
598  }
599 
600  return NULL;
601  }
602  };
603 
606 }
607 #endif
bool Locate(KeyT key, int &index) const
Find the key.
Definition: wb-lhash.h:333
const int c_IntNokeyValue
nokey value for int
Definition: wb-lhash.h:67
bool Map_equalKey(const char *pStr1, const char *pStr2)
Definition: wb-lhash.h:118
DataT * Next(KeyT &key)
get next value
Definition: wb-lhash.h:576
void Alloc(int nSize)
Allocate the buffer.
Definition: wb-lhash.h:255
LHashIter(LHash< KeyT, DataT > *phash, bool(*sort)(KeyT, KeyT)=0)
constructor
Definition: wb-lhash.h:519
const short c_ShortNokeyValue
nokey value for short
Definition: wb-lhash.h:66
int m_nUnitNum
The unit number add to the hash.
Definition: wb-lhash.h:221
int m_nMaxBits
The bit number of the memory buffer. (2^m_nMaxBits is the memory size)
Definition: wb-lhash.h:220
const int cn_MinLHashBits
if the bits of hash less than this value, using linear table
Definition: wb-lhash.h:197
#define HashMask(nbits)
Definition: wb-lhash.h:121
void Clean()
Clean the hash, but don&#39;t release the buffer.
Definition: wb-lhash.h:309
size_t TotalMemCost()
Compute the whole memory cost of hash structure.
Definition: wb-lhash.h:243
bool LHash_IncSort(const char *p1, const char *p2)
Definition: wb-lhash.h:194
const double c_doubleNokeyValue
nokey value for double
Definition: wb-lhash.h:74
int GetSize() const
Get the buffer size of hash.
Definition: wb-lhash.h:237
int RoundSize(int nSize)
calculate the buffer size need to be allocated
Definition: wb-lhash.h:248
~LHashIter()
destructor
Definition: wb-lhash.h:526
Unit * m_pUnit
pointer to the buffer.
Definition: wb-lhash.h:222
const float cf_HashRatio
fill-rate of the hash.
Definition: wb-lhash.h:199
~LHash()
destructor
Definition: wb-lhash.h:233
void Init()
initialize the iter
Definition: wb-lhash.h:533
LHash(int nSize=0)
constructor
Definition: wb-lhash.h:231
const long c_LongNokeyValue
nokey value for long
Definition: wb-lhash.h:68
the Unit of hash
Definition: wb-lhash.h:214
unsigned long LHash_hashKey(const char *key, unsigned maxBits)
hash for string
Definition: wb-lhash.h:177
void Map_freeKey(const char *key)
Free a key, if the key is string.
Definition: wb-lhash.h:57
void Release()
Release the buffer.
Definition: wb-lhash.h:284
int GetNum() const
Get the unit number.
Definition: wb-lhash.h:235
void Map_noKey(double &key)
Definition: wb-lhash.h:90
the iter of LHash
Definition: wb-lhash.h:42
DataT * Insert(KeyT key)
Insert a value.
Definition: wb-lhash.h:461
const unsigned c_UIntNokeyValue
nokey value for unsigned int
Definition: wb-lhash.h:71
const long unsigned c_ULongNokeyValue
nokey value for unsigned long
Definition: wb-lhash.h:72
bool Map_noKeyP(double &key)
Definition: wb-lhash.h:107
DataT * Insert(KeyT key, bool &bFound)
Insert a value.
Definition: wb-lhash.h:408
int GetMaxBits() const
Get the buffer size bits.
Definition: wb-lhash.h:239
void Reset(int nSize)
Reset the hash buffer.
Definition: wb-lhash.h:302
DataT * Find(KeyT key)
Find a value.
Definition: wb-lhash.h:403
KeyT key
the key
Definition: wb-lhash.h:216
const float c_floatNokeyValue
nokey value for float
Definition: wb-lhash.h:73
const char * Map_copyKey(const char *key)
Copy a string key.
Definition: wb-lhash.h:50
#define Bit2Size(bits)
Definition: wb-lhash.h:34
const long long c_LLongNokeyValue
nokey value for longlong
Definition: wb-lhash.h:69
const int cn_MaxLHashBits
the maximum bits number support by hast
Definition: wb-lhash.h:198
void Copy(LHash< KeyT, DataT > &other)
Copy hash.
Definition: wb-lhash.h:467
void Fill(DataT d)
Set all the values to d.
Definition: wb-lhash.h:322
define all the code written by Bin Wang.
Definition: wb-file.cpp:21
const short unsigned c_UShortNokeyValue
nokey value for unsigned short
Definition: wb-lhash.h:70
a linear hash table
Definition: wb-lhash.h:41
DataT value
the value
Definition: wb-lhash.h:217
DataT * Find(KeyT key, bool &bFound)
Find a value.
Definition: wb-lhash.h:392
const Unit * GetBuffer() const
Get the buffer.
Definition: wb-lhash.h:241