SDSL  3.0.0
Succinct Data Structure Library
wt_hutu.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
9 #ifndef INCLUDED_SDSL_WT_HUTU
10 #define INCLUDED_SDSL_WT_HUTU
11 
12 #include <vector>
13 
14 #include <sdsl/wt_pc.hpp>
15 
17 namespace sdsl
18 {
19 
20 // forward declaration
21 struct hutu_shape;
22 
24 
37 template <class t_bitvector = bit_vector,
38  class t_rank = typename t_bitvector::rank_1_type,
39  class t_select = typename t_bitvector::select_1_type,
40  class t_select_zero = typename t_bitvector::select_0_type,
41  class t_tree_strat = byte_tree<>>
43 
44 // Hu Tucker shape for wt_pc
45 template <class t_wt>
47 {
48  typedef typename t_wt::size_type size_type;
49  enum
50  {
51  lex_ordered = 1
52  };
53 
55  template <class t_element>
56  struct heap_node
57  {
58  t_element * item; // pointer to the represented item
59  heap_node *left, *right, *parent; // pointer to left/right child, parent
60  int64_t rank; // rank of the heap node
62  heap_node(t_element * it = nullptr)
63  : item(it)
64  , left(nullptr)
65  , right(nullptr)
66  , parent(nullptr)
67  , rank(0)
68  {}
70  bool operator<(const heap_node & other) { return *item < *(other.item); }
71  };
72 
73  // Implementation of a leftist heap as needed in the first phase of
74  // Hu-Tucker Code construction
75  template <class t_element>
76  class l_heap
77  {
78  private:
79  heap_node<t_element> * m_root; // pointer to the root
80 
81  // fixes node information after the deletion of elements
82  void fix_node(heap_node<t_element> * item)
83  {
84  if (item != nullptr)
85  {
86  if (!item->left || !item->right)
87  { // if node has only one child
88  // only go on fixing if the node information needs to be changed
89  if (item->rank != 0)
90  {
91  item->rank = 0;
92  if (item->parent) fix_node(item->parent);
93  }
94  }
95  else
96  { // node information has to be adapted
97  int64_t nn = (item->left->rank > item->right->rank) ? item->right->rank : item->left->rank;
98  if (item->rank != nn && item->parent != 0)
99  {
100  item->rank = nn;
101  fix_node(item->parent);
102  }
103  }
104  }
105  }
106 
107  // helper function to remove the data structure from memory
108  void free_node(heap_node<t_element> * item)
109  {
110  if (item->left)
111  {
112  free_node(item->left);
113  delete item->left;
114  item->left = nullptr;
115  }
116  if (item->right)
117  {
118  free_node(item->right);
119  delete item->right;
120  item->right = nullptr;
121  }
122  }
123 
124  // internal merge function
126  {
127  if (!h1) return h2;
128  if (!h2) return h1;
129  if (*(h1->item) < *(h2->item))
130  return merge1(h1, h2);
131  else
132  return merge1(h2, h1);
133  }
134  // internal merge function
136  {
137  if (!h1->left)
138  { // if h1 has no children, the merge is simple
139  h1->left = h2;
140  h2->parent = h1; // adjust the parent pointer
141  }
142  else
143  {
144  h1->right = merge(h1->right, h2);
145  if (h1->right) { h1->right->parent = h1; }
146 
147  if ((h1->left->rank) < (h1->right->rank))
148  {
149  heap_node<t_element> * tmp = h1->left;
150  h1->left = h1->right;
151  h1->right = tmp;
152  }
153  h1->rank = h1->right->rank + 1;
154  }
155  return h1;
156  }
157 
158  public:
161  : m_root(nullptr)
162  {}
163 
165  bool empty() const { return (m_root == nullptr); }
166 
168 
171  heap_node<t_element> * find_min() const { return m_root; }
172 
174 
178  {
179  if (m_root == nullptr) return nullptr;
180  if (m_root->left == nullptr) return m_root->right;
181  if (m_root->right == nullptr) return m_root->left;
182 
183  if (m_root->left->operator<(*m_root->right))
184  return m_root->left;
185  else
186  return m_root->right;
187  }
188 
190 
193  heap_node<t_element> * insert(t_element * x)
194  {
197  lh.m_root = n;
198  merge(&lh);
199  return n;
200  }
201 
203  void delete_min()
204  {
205  heap_node<t_element> * old_root = m_root;
206  m_root = merge(m_root->left, m_root->right);
207  if (m_root) m_root->parent = nullptr;
208  delete old_root;
209  }
210 
211  // deletes an arbitrary element from the heap
212  // this function assumes, that item is an element of the heap
214  {
215  if (item != nullptr)
216  {
217  if (m_root == item)
218  { // deleting the root is trivial
219  delete_min();
220  }
221  else
222  {
223  // otherwise we have to adapt the parent node and
224  // the children of item
225  heap_node<t_element> * h1 = merge(item->left, item->right);
226  if (h1) h1->parent = item->parent;
227  if (item == item->parent->left) { item->parent->left = h1; }
228  else if (item == item->parent->right)
229  {
230  item->parent->right = h1;
231  }
232  // fix node information considering rank
233  fix_node(item->parent);
234  delete item; // remove the item from memory
235  }
236  }
237  }
238 
239  // public merge function
241  {
242  m_root = merge(m_root, rhs->m_root);
243  rhs->m_root = nullptr;
244  }
245 
246  // removes the whole data structure from memory
247  void free_memory()
248  {
249  if (m_root != nullptr)
250  {
251  free_node(m_root);
252  delete m_root;
253  m_root = nullptr;
254  }
255  }
256  };
257 
258  // forward declaration of node classes
259  struct ht_node;
260 
261  // Master node as used in the first phase of the Hu-Tucker algorithm
262  struct m_node
263  {
264  // min sum of the two min elements of the hpq this node points to
266  int64_t i; // position of the left node in the working sequence
267  int64_t j; // position of the right node in the working sequence
268  // pointer to the corresponding heap element (used for deletion)
270  l_heap<ht_node> * myhpq; // pointer to the hpq
271 
272  ht_node * lt; // pointer to the left- and rightmost leafs of the hpq
273  ht_node * rt; // need for merge operations
274 
276  : qel(0)
277  , myhpq(0)
278  , lt(0)
279  , rt(0)
280  {}
281 
282  bool operator<(const m_node other)
283  {
284  if (min_sum != other.min_sum) { return min_sum < other.min_sum; }
285  if (i != other.i) { return i < other.i; }
286  return j < other.j;
287  }
288 
289  bool operator>(const m_node other) { return other < *this; }
290  };
291 
292  // Hu-Tucker node as used in the first phase of the Hu-Tucker algorithm
293  struct ht_node
294  {
295  int64_t pos; // position of the node
296  uint64_t c; // the represented letter
297  size_type w; // frequency of the node
298  bool t; // whether the node is a leaf
299  int64_t level; // level in the tree
300 
301  // pointer to the two master nodes
302  // (as a node can belong to up to two hpqs)
304  m_node * mpqr; // only mpql is used for inner nodes
305  // pointer to the two heap nodes (as a node can belong to up to two hpqs)
307  heap_node<ht_node> * qr; // only ql is used for inner nodes
308  ht_node * left; // left child
309  ht_node * right; // right child
310 
312  : mpql(0)
313  , mpqr(0)
314  , ql(0)
315  , qr(0)
316  , left(nullptr)
317  , right(nullptr)
318  {}
319 
320  bool operator<(const ht_node & other)
321  {
322  if (w != other.w) { return w < other.w; }
323  return pos < other.pos;
324  }
325 
326  bool operator>(const ht_node & other) { return other < *this; }
327  };
328 
329  template <class t_rac>
330  static void construct_tree(t_rac & C, std::vector<pc_node> & temp_nodes)
331  {
332  // create a leaf for every letter
333  std::vector<ht_node> node_vector;
334  for (size_t i = 0; i < C.size(); i++)
335  {
336  if (C[i])
337  {
338  ht_node n;
339  n.c = (uint64_t)i;
340  n.w = C[i];
341  n.t = true;
342  n.pos = node_vector.size();
343  node_vector.push_back(n);
344  }
345  }
346  if (node_vector.size() == 1)
347  {
348  // special case of an alphabet of size 1:
349  // just instantly create the tree and return it
350  temp_nodes.emplace_back(pc_node(node_vector[0].w, (size_type)node_vector[0].c));
351  return;
352  }
353  size_type sigma = node_vector.size();
354  std::vector<ht_node> T(sigma); // physical leaves
355  std::vector<ht_node *> A(sigma); // the current working sequence
356  // Priority Queues, containing the Huffman Sequences
357  std::vector<l_heap<ht_node>> HPQ(sigma);
358  l_heap<m_node> MPQ; // Master Priority Queue
359 
360  // init T, A, HPQs and MPQ
361  T[0] = node_vector[0];
362  A[0] = &T[0];
363 
364  // initialization needed for every leaf
365  for (size_type i = 1; i < sigma; i++)
366  {
367  T[i] = node_vector[i];
368  A[i] = &T[i];
369 
370  T[i - 1].qr = HPQ[i - 1].insert(&T[i - 1]);
371  T[i].ql = HPQ[i - 1].insert(&T[i]);
372 
373  m_node * m = new m_node();
374  m->min_sum = T[i - 1].w + T[i].w;
375  m->i = i - 1;
376  m->j = i;
377  m->lt = &T[i - 1];
378  m->rt = &T[i];
379  m->myhpq = &HPQ[i - 1];
380 
381  m->qel = MPQ.insert(m);
382 
383  T[i - 1].mpqr = m;
384  T[i].mpql = m;
385  }
386 
387  // main action loop
388  for (size_type k = 1; k < sigma; k++)
389  {
390  m_node * m = MPQ.find_min()->item;
391  ht_node * l = A[m->i];
392  ht_node * r = A[m->j];
393  int64_t lpos = m->i;
394  int64_t rpos = m->j;
395 
396  l_heap<ht_node> * n_hpq = nullptr;
397  ht_node * n_rt = nullptr;
398  ht_node * n_lt = nullptr;
399 
400  // create a new master priority queue
401  m_node * n_m = new m_node();
402  // delete old nodes from all hpqs
403  if (l->t)
404  {
405  if (l->mpql) l->mpql->myhpq->delete_element(l->ql);
406  l->ql = nullptr;
407  if (l->mpqr) l->mpqr->myhpq->delete_element(l->qr);
408  l->qr = nullptr;
409  }
410  else
411  {
412  m->myhpq->delete_element(l->ql);
413  l->ql = nullptr;
414  }
415  if (r->t)
416  {
417  if (r->mpql) r->mpql->myhpq->delete_element(r->ql);
418  l->ql = nullptr;
419 
420  if (r->mpqr) r->mpqr->myhpq->delete_element(r->qr);
421  r->qr = nullptr;
422  }
423  else
424  {
425  m->myhpq->delete_element(r->ql);
426  r->ql = nullptr;
427  }
428  // handle the merge of hpqs
429  if (l->t && r->t)
430  {
431  // both nodes are leaves
432  l_heap<ht_node> * h1 = nullptr;
433  l_heap<ht_node> * h2 = nullptr;
434  l_heap<ht_node> * h3 = nullptr;
435  if (l->mpql)
436  {
437  n_lt = l->mpql->lt;
438  if (n_lt == l) n_lt = nullptr;
439  if (n_lt) n_lt->mpqr = n_m;
440 
441  h1 = l->mpql->myhpq;
442  h2 = l->mpqr->myhpq;
443 
444  h1->merge(h2);
445  MPQ.delete_element(l->mpql->qel);
446  MPQ.delete_element(l->mpqr->qel);
447  delete l->mpql;
448  delete l->mpqr;
449  }
450  else
451  {
452  h1 = l->mpqr->myhpq;
453  h2 = l->mpqr->myhpq;
454  n_lt = nullptr;
455 
456  MPQ.delete_element(l->mpqr->qel);
457  delete l->mpqr;
458  }
459  if (r->mpqr)
460  {
461  n_rt = r->mpqr->rt;
462  if (n_rt == r) n_rt = nullptr;
463  if (n_rt) n_rt->mpql = n_m;
464 
465  h3 = r->mpqr->myhpq;
466  h1->merge(h3);
467  MPQ.delete_element(r->mpqr->qel);
468  delete r->mpqr;
469 
470  n_hpq = h1;
471  if (n_rt) n_rt->mpql = n_m;
472  }
473  else
474  {
475  n_rt = nullptr;
476  n_hpq = h1;
477  }
478  }
479  else if (l->t)
480  { // the left node is a leaf
481  if (l->mpql)
482  {
483  n_lt = l->mpql->lt;
484  if (n_lt) n_lt->mpqr = n_m;
485  n_rt = l->mpqr->rt;
486  if (n_rt) n_rt->mpql = n_m;
487 
488  l->mpql->myhpq->merge(l->mpqr->myhpq);
489  n_hpq = l->mpql->myhpq;
490  MPQ.delete_element(l->mpql->qel);
491  MPQ.delete_element(l->mpqr->qel);
492  delete l->mpql;
493  delete l->mpqr;
494  }
495  else
496  {
497  n_lt = nullptr;
498  n_rt = l->mpqr->rt;
499  if (n_rt) n_rt->mpql = n_m;
500 
501  n_hpq = l->mpqr->myhpq;
502  MPQ.delete_element(l->mpqr->qel);
503  delete l->mpqr;
504  }
505  }
506  else if (r->t)
507  { // right node is a leaf
508  if (r->mpqr)
509  {
510  n_lt = r->mpql->lt;
511  if (n_lt) n_lt->mpqr = n_m;
512  n_rt = r->mpqr->rt;
513  if (n_rt) n_rt->mpql = n_m;
514 
515  r->mpql->myhpq->merge(r->mpqr->myhpq);
516  n_hpq = r->mpql->myhpq;
517  MPQ.delete_element(r->mpql->qel);
518  MPQ.delete_element(r->mpqr->qel);
519  delete r->mpql;
520  delete r->mpqr;
521  }
522  else
523  {
524  n_lt = r->mpql->lt;
525  if (n_lt) n_lt->mpqr = n_m;
526  n_rt = nullptr;
527 
528  n_hpq = r->mpql->myhpq;
529  MPQ.delete_element(r->mpql->qel);
530  delete r->mpql;
531  }
532  }
533  else
534  {
535  // merge of two inner nodes
536  // no need to merge hpqs
537  MPQ.delete_element(m->qel);
538 
539  n_hpq = m->myhpq;
540  n_lt = m->lt;
541  n_rt = m->rt;
542 
543  if (n_lt) n_lt->mpqr = n_m;
544  if (n_rt) n_rt->mpql = n_m;
545 
546  delete m;
547  }
548 
549  // create a new node with the information gained above
550  ht_node * new_node = new ht_node();
551  new_node->c = ' ';
552  new_node->w = l->w + r->w;
553  new_node->t = false;
554  new_node->pos = lpos;
555  new_node->left = l;
556  new_node->right = r;
557  // insert node to the correct hpq
558  new_node->ql = n_hpq->insert(new_node);
559 
560  // update working sequence
561  A[lpos] = new_node;
562  A[rpos] = nullptr;
563  // update information in the new master node and reinsert it to mpq
564  ht_node * tmp_min = n_hpq->find_min()->item;
565  heap_node<ht_node> * tmpsnd = n_hpq->find_snd_min();
566  if (tmpsnd)
567  {
568  ht_node * tmp_snd = n_hpq->find_snd_min()->item;
569  n_m->min_sum = tmp_min->w + tmp_snd->w;
570 
571  if (tmp_min->pos < tmp_snd->pos)
572  {
573  n_m->i = tmp_min->pos;
574  n_m->j = tmp_snd->pos;
575  }
576  else
577  {
578  n_m->i = tmp_snd->pos;
579  n_m->j = tmp_min->pos;
580  }
581  n_m->qel = MPQ.insert(n_m);
582  n_m->myhpq = n_hpq;
583  n_m->lt = n_lt;
584  n_m->rt = n_rt;
585  }
586  else
587  {
588  // free the last remaining hpq
589  n_hpq->free_memory();
590  delete n_m;
591  }
592  }
593 
594  // level assignment and deletion of unneeded nodes
595  assign_level(A[0], 0);
596 
597  // reconstruction phase using the stack algorithm
598  std::vector<ht_node *> stack(sigma, nullptr);
599 
600  for (size_type i = 0; i < sigma; i++)
601  {
602  temp_nodes.emplace_back(pc_node(T[i].w, (size_type)T[i].c));
603  T[i].pos = i;
604  }
605 
606  int64_t spointer = -1;
607  uint64_t qpointer = 0; // use the Array T as a stack
608  int64_t max_nodes = sigma;
609  while (qpointer < sigma or spointer >= 1LL)
610  {
611  if (spointer >= 1LL and (stack[spointer]->level == stack[spointer - 1]->level))
612  {
613  ht_node * n_node = new ht_node();
614  max_nodes++;
615  n_node->t = false;
616  n_node->left = stack[spointer - 1];
617  n_node->right = stack[spointer];
618  n_node->level = stack[spointer]->level - 1;
619  n_node->w = stack[spointer]->w + stack[spointer - 1]->w;
620  n_node->c = '|';
621 
622  n_node->pos = temp_nodes.size();
623  temp_nodes[stack[spointer - 1]->pos].parent = temp_nodes.size();
624  temp_nodes[stack[spointer]->pos].parent = temp_nodes.size();
625  temp_nodes.emplace_back(pc_node(n_node->w,
626  0,
628  stack[spointer - 1]->pos,
629  stack[spointer]->pos));
630 
631  if (!stack[spointer - 1]->t) delete stack[spointer - 1];
632  if (!stack[spointer]->t) delete stack[spointer];
633 
634  stack[--spointer] = n_node;
635  }
636  else
637  {
638  stack[++spointer] = &T[qpointer++];
639  }
640  }
641  delete stack[0];
642  }
643 
644  static void assign_level(ht_node * n, int64_t lvl)
645  {
646  if (n)
647  {
648  n->level = lvl;
649  assign_level(n->left, lvl + 1);
650  assign_level(n->right, lvl + 1);
651 
652  if (!n->t) { delete n; }
653  }
654  }
655 };
656 
658 {
659  template <class t_wt>
661 };
662 
663 } // end namespace sdsl
664 
665 #endif // end file
void merge(l_heap< t_element > *rhs)
Definition: wt_hutu.hpp:240
void delete_element(heap_node< t_element > *item)
Definition: wt_hutu.hpp:213
l_heap()
Default constructor.
Definition: wt_hutu.hpp:160
heap_node< t_element > * find_min() const
Get the smallest element.
Definition: wt_hutu.hpp:171
bool empty() const
Indicates if the heap is empty.
Definition: wt_hutu.hpp:165
heap_node< t_element > * find_snd_min() const
Get the second smallest element.
Definition: wt_hutu.hpp:177
void delete_min()
Delete the smallest element in the heap.
Definition: wt_hutu.hpp:203
heap_node< t_element > * insert(t_element *x)
Insert an element into the heap.
Definition: wt_hutu.hpp:193
A prefix code-shaped wavelet.
Definition: wt_pc.hpp:44
int_vector ::size_type size_type
Namespace for the succinct data structure library.
int_vector< 1 > bit_vector
bit_vector is a specialization of the int_vector.
Definition: int_vector.hpp:51
Node class used by the leftist heap.
Definition: wt_hutu.hpp:57
heap_node(t_element *it=nullptr)
Constructor.
Definition: wt_hutu.hpp:62
bool operator<(const heap_node &other)
Less then operator.
Definition: wt_hutu.hpp:70
heap_node< ht_node > * qr
Definition: wt_hutu.hpp:307
bool operator>(const ht_node &other)
Definition: wt_hutu.hpp:326
heap_node< ht_node > * ql
Definition: wt_hutu.hpp:306
bool operator<(const ht_node &other)
Definition: wt_hutu.hpp:320
bool operator<(const m_node other)
Definition: wt_hutu.hpp:282
l_heap< ht_node > * myhpq
Definition: wt_hutu.hpp:270
bool operator>(const m_node other)
Definition: wt_hutu.hpp:289
heap_node< m_node > * qel
Definition: wt_hutu.hpp:269
static void construct_tree(t_rac &C, std::vector< pc_node > &temp_nodes)
Definition: wt_hutu.hpp:330
t_wt::size_type size_type
Definition: wt_hutu.hpp:48
static void assign_level(ht_node *n, int64_t lvl)
Definition: wt_hutu.hpp:644
wt_pc.hpp contains a class for the wavelet tree of byte sequences.