SDSL  3.0.0
Succinct Data Structure Library
wt_rlmn.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
9 #ifndef INCLUDED_SDSL_WT_RLMN
10 #define INCLUDED_SDSL_WT_RLMN
11 
12 #include <algorithm> // for std::swap
13 #include <iostream>
14 #include <queue>
15 #include <stdexcept>
16 #include <utility> // for pair
17 #include <vector>
18 
19 #include <sdsl/int_vector.hpp>
20 #include <sdsl/sd_vector.hpp> // for standard initialisation of template parameters
21 #include <sdsl/sdsl_concepts.hpp>
22 #include <sdsl/util.hpp>
23 #include <sdsl/wt_huff.hpp>
24 
26 namespace sdsl
27 {
28 
29 template <class t_alphabet_cat>
31 {
32  enum
33  {
34  width = 0
35  };
38 
39  static std::map<uint64_t, uint64_t> temp_C() { return std::map<uint64_t, uint64_t>(); }
40 
41  static C_type init_C(std::map<uint64_t, uint64_t> & C, uint64_t size)
42  {
43  uint64_t max_symbol = (--C.end())->first;
44  return C_type(max_symbol + 1, 0, bits::hi(size) + 1);
45  }
46 
47  static C_bf_rank_type init_C_bf_rank(const C_type & C, uint64_t size)
48  {
49  return C_bf_rank_type(C.size(), 0, bits::hi(size) + 1);
50  }
51 };
52 
53 template <>
55 {
56  enum
57  {
58  width = 8
59  };
62 
63  static int_vector<64> temp_C() { return int_vector<64>(256, 0); }
64 
65  static C_type init_C(C_type & C, uint64_t) { return C; }
66 
67  static C_bf_rank_type init_C_bf_rank(const C_type &, uint64_t) { return int_vector<64>(256, 0); }
68 };
69 
71 
90 template <class t_bitvector = sd_vector<>,
91  class t_rank = typename t_bitvector::rank_1_type,
92  class t_select = typename t_bitvector::select_1_type,
93  class t_wt = wt_huff<>>
94 class wt_rlmn
95 {
96  public:
97  typedef t_wt wt_type;
99  typedef typename t_wt::value_type value_type;
100  typedef typename t_bitvector::difference_type difference_type;
103  typedef t_bitvector bit_vector_type;
104  typedef t_rank rank_support_type;
105  typedef t_select select_support_type;
107  typedef typename t_wt::alphabet_category alphabet_category;
108  enum
109  {
110  lex_ordered = false
111  }; // TODO: is should be possible
112  enum
113  {
115  };
118  // to support all lex_ordered
119  // operations if t_wt::lex_ordered is
120  // true
121 
122  private:
123  size_type m_size = 0; // size of the original input sequence
124  bit_vector_type m_bl; // bit vector for starts of runs in
125  // the BWT (or last column), i.e. _b_ _l_ast
126  bit_vector_type m_bf; // bit vector for starts of runs in
127  // the first column of the sorted suffixes, i.e _b_ _f_irst
128  wt_type m_wt; // wavelet tree for all levels
129  // two equal chars
130  rank_support_type m_bl_rank; // rank support for bit vector bl
131  rank_support_type m_bf_rank; // rank support for bit vector bf
132  select_support_type m_bl_select; // select support for bit vector bl
133  select_support_type m_bf_select; // select support for bit vector bf
134  C_type m_C; //
135  C_bf_rank_type m_C_bf_rank; // stores the number of 1s in m_bf for
136  // the prefixes m_bf[0..m_C[0]],m_bf[0..m_C[1]],....,m_bf[0..m_C[255]];
137  // named C_s in the original paper
138 
139  public:
140  const size_type & sigma = m_wt.sigma;
141 
142  // Default constructor
143  wt_rlmn() = default;
144 
146 
151  template <typename t_it>
152  wt_rlmn(t_it begin, t_it end, std::string tmp_dir = ram_file_name(""))
153  : m_size(std::distance(begin, end))
154  {
155  std::string temp_file = tmp_dir + +"_wt_rlmn_" + util::to_string(util::pid()) + "_" +
157  {
158  if (0 == m_size) return;
159  int_vector_buffer<width> condensed_wt(temp_file, std::ios::out);
160  // scope for bl and bf
161  bit_vector bl = bit_vector(m_size, 0);
162 
164  value_type last_c = (value_type)0;
165  size_type j = 0;
166  for (auto it = begin; it != end; ++it, ++j)
167  {
168  value_type c = *it;
169  if (last_c != c or it == begin)
170  {
171  bl[j] = 1;
172  condensed_wt.push_back(c);
173  }
174  ++C[c];
175  last_c = c;
176  }
177  condensed_wt.close();
179 
180  for (size_type i = 0, prefix_sum = 0; i < m_C.size(); ++i)
181  {
182  m_C[i] = prefix_sum;
183  prefix_sum += C[i];
184  }
185 
186  C_type lf_map = m_C;
187  bit_vector bf = bit_vector(m_size + 1, 0);
188  bf[m_size] = 1; // initialize last element
189  j = 0;
190  for (auto it = begin; it != end; ++it, ++j)
191  {
192  value_type c = *it;
193  if (bl[j]) { bf[lf_map[c]] = 1; }
194  ++lf_map[c];
195  }
196  {
197  int_vector_buffer<width> temp_bwt_buf(temp_file);
198  m_wt = wt_type(temp_bwt_buf.begin(), temp_bwt_buf.end(), tmp_dir);
199  }
200  sdsl::remove(temp_file);
201  m_bl = bit_vector_type(std::move(bl));
202  m_bf = bit_vector_type(std::move(bf));
203  }
204 
205  util::init_support(m_bl_rank, &m_bl);
206  util::init_support(m_bf_rank, &m_bf);
207  util::init_support(m_bf_select, &m_bf);
208  util::init_support(m_bl_select, &m_bl);
209 
210  m_C_bf_rank = wt_rlmn_trait<alphabet_category>::init_C_bf_rank(m_C, m_size);
211  for (size_type i = 0; i < m_C.size(); ++i) { m_C_bf_rank[i] = m_bf_rank(m_C[i]); }
212  }
213 
215  wt_rlmn(const wt_rlmn & wt)
216  : m_size(wt.m_size)
217  , m_bl(wt.m_bl)
218  , m_bf(wt.m_bf)
219  , m_wt(wt.m_wt)
220  , m_bl_rank(wt.m_bl_rank)
221  , m_bf_rank(wt.m_bf_rank)
222  , m_bl_select(wt.m_bl_select)
223  , m_bf_select(wt.m_bf_select)
224  , m_C(wt.m_C)
225  , m_C_bf_rank(wt.m_C_bf_rank)
226  {
227  m_bl_rank.set_vector(&m_bl);
228  m_bf_rank.set_vector(&m_bf);
229  m_bl_select.set_vector(&m_bl);
230  m_bf_select.set_vector(&m_bf);
231  }
232 
235  : m_size(wt.m_size)
236  , m_bl(std::move(wt.m_bl))
237  , m_bf(std::move(wt.m_bf))
238  , m_wt(std::move(wt.m_wt))
239  , m_bl_rank(std::move(wt.m_bl_rank))
240  , m_bf_rank(std::move(wt.m_bf_rank))
241  , m_bl_select(std::move(wt.m_bl_select))
242  , m_bf_select(std::move(wt.m_bf_select))
243  , m_C(std::move(wt.m_C))
244  , m_C_bf_rank(std::move(wt.m_C_bf_rank))
245  {
246  m_bl_rank.set_vector(&m_bl);
247  m_bf_rank.set_vector(&m_bf);
248  m_bl_select.set_vector(&m_bl);
249  m_bf_select.set_vector(&m_bf);
250  }
251 
253  wt_rlmn & operator=(const wt_rlmn & wt)
254  {
255  if (this != &wt)
256  {
257  wt_rlmn tmp(wt);
258  *this = std::move(tmp);
259  }
260  return *this;
261  }
262 
265  {
266  if (this != &wt)
267  {
268  m_size = std::move(wt.m_size);
269  m_bl = std::move(wt.m_bl);
270  m_bf = std::move(wt.m_bf);
271  m_wt = std::move(wt.m_wt);
272  m_bl_rank = std::move(wt.m_bl_rank);
273  m_bl_rank.set_vector(&m_bl);
274  m_bf_rank = std::move(wt.m_bf_rank);
275  m_bf_rank.set_vector(&m_bf);
276  m_bl_select = std::move(wt.m_bl_select);
277  m_bl_select.set_vector(&m_bl);
278  m_bf_select = std::move(wt.m_bf_select);
279  m_bf_select.set_vector(&m_bf);
280  m_C = std::move(wt.m_C);
281  m_C_bf_rank = std::move(wt.m_C_bf_rank);
282  }
283  return *this;
284  }
285 
287  size_type size() const { return m_size; }
288 
290  bool empty() const { return 0 == m_size; }
291 
293 
300  {
301  assert(i < size());
302  return m_wt[m_bl_rank(i + 1) - 1];
303  };
304 
306 
315  {
316  assert(i <= size());
317  if (i == 0) return 0;
318  size_type wt_ex_pos = m_bl_rank(i);
319  size_type c_runs = m_wt.rank(wt_ex_pos, c);
320  if (c_runs == 0) return 0;
321  if (m_wt[wt_ex_pos - 1] == c)
322  {
323  size_type c_run_begin = m_bl_select(wt_ex_pos);
324  return m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin;
325  }
326  else
327  {
328  return m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c];
329  }
330  };
331 
333 
339  std::pair<size_type, value_type> inverse_select(size_type i) const
340  {
341  assert(i < size());
342  if (i == 0) { return std::make_pair(0, m_wt[0]); }
343  size_type wt_ex_pos = m_bl_rank(i + 1);
344  auto rc = m_wt.inverse_select(wt_ex_pos - 1);
345  size_type c_runs = rc.first + 1;
346  value_type c = rc.second;
347  if (c_runs == 0) return std::make_pair(0, c);
348  if (m_wt[wt_ex_pos - 1] == c)
349  {
350  size_type c_run_begin = m_bl_select(wt_ex_pos);
351  return std::make_pair(m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin, c);
352  }
353  else
354  {
355  return std::make_pair(m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c], c);
356  }
357  }
358 
360 
368  {
369  assert(i > 0);
370  assert(i <= rank(size(), c));
371  size_type c_runs = m_bf_rank(m_C[c] + i) - m_C_bf_rank[c];
372  size_type offset = m_C[c] + i - 1 - m_bf_select(c_runs + m_C_bf_rank[c]);
373  return m_bl_select(m_wt.select(c_runs, c) + 1) + offset;
374  };
375 
377  const_iterator begin() const { return const_iterator(this, 0); }
378 
380  const_iterator end() const { return const_iterator(this, size()); }
381 
383  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
384  {
385  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
386  size_type written_bytes = 0;
387  written_bytes += write_member(m_size, out, child, "size");
388  written_bytes += m_bl.serialize(out, child, "bl");
389  written_bytes += m_bf.serialize(out, child, "bf");
390  written_bytes += m_wt.serialize(out, child, "wt");
391  written_bytes += m_bl_rank.serialize(out, child, "bl_rank");
392  written_bytes += m_bf_rank.serialize(out, child, "bf_rank");
393  written_bytes += m_bl_select.serialize(out, child, "bl_select");
394  written_bytes += m_bf_select.serialize(out, child, "bf_select");
395  written_bytes += m_C.serialize(out, child, "C");
396  written_bytes += m_C_bf_rank.serialize(out, child, "C_bf_rank");
397  structure_tree::add_size(child, written_bytes);
398  return written_bytes;
399  }
400 
402  void load(std::istream & in)
403  {
404  read_member(m_size, in);
405  m_bl.load(in);
406  m_bf.load(in);
407  m_wt.load(in);
408  m_bl_rank.load(in, &m_bl);
409  m_bf_rank.load(in, &m_bf);
410  m_bl_select.load(in, &m_bl);
411  m_bf_select.load(in, &m_bf);
412  m_C.load(in);
413  m_C_bf_rank.load(in);
414  }
415 
417  template <typename archive_t>
418  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
419  {
420  ar(CEREAL_NVP(m_size));
421  ar(CEREAL_NVP(m_bl));
422  ar(CEREAL_NVP(m_bf));
423  ar(CEREAL_NVP(m_wt));
424  ar(CEREAL_NVP(m_bl_rank));
425  ar(CEREAL_NVP(m_bf_rank));
426  ar(CEREAL_NVP(m_bl_select));
427  ar(CEREAL_NVP(m_bf_select));
428  ar(CEREAL_NVP(m_C));
429  ar(CEREAL_NVP(m_C_bf_rank));
430  }
431 
433  template <typename archive_t>
434  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
435  {
436  ar(CEREAL_NVP(m_size));
437  ar(CEREAL_NVP(m_bl));
438  ar(CEREAL_NVP(m_bf));
439  ar(CEREAL_NVP(m_wt));
440  ar(CEREAL_NVP(m_bl_rank));
441  m_bl_rank.set_vector(&m_bl);
442  ar(CEREAL_NVP(m_bf_rank));
443  m_bf_rank.set_vector(&m_bf);
444  ar(CEREAL_NVP(m_bl_select));
445  m_bl_select.set_vector(&m_bl);
446  ar(CEREAL_NVP(m_bf_select));
447  m_bf_select.set_vector(&m_bf);
448  ar(CEREAL_NVP(m_C));
449  ar(CEREAL_NVP(m_C_bf_rank));
450  }
451 
453  bool operator==(wt_rlmn const & other) const noexcept
454  {
455  return (m_size == other.m_size) && (m_bl == other.m_bl) && (m_bf == other.m_bf) && (m_wt == other.m_wt) &&
456  (m_bl_rank == other.m_bl_rank) && (m_bf_rank == other.m_bf_rank) && (m_bl_select == other.m_bl_select) &&
457  (m_bf_select == other.m_bf_select) && (m_C == other.m_C) && (m_C_bf_rank == other.m_C_bf_rank);
458  }
459 
461  bool operator!=(wt_rlmn const & other) const noexcept { return !(*this == other); }
462 };
463 
464 } // end namespace sdsl
465 #endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:30
void close(bool remove_file=false)
Close the int_vector_buffer.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
size_type size() const noexcept
The number of elements in the int_vector.
Generic iterator for a random access container.
Definition: iterators.hpp:24
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
A Wavelet Tree class for byte sequences.
Definition: wt_rlmn.hpp:95
t_bitvector bit_vector_type
Definition: wt_rlmn.hpp:103
wt_rlmn_trait< alphabet_category >::C_bf_rank_type C_bf_rank_type
Definition: wt_rlmn.hpp:117
t_wt::alphabet_category alphabet_category
Definition: wt_rlmn.hpp:107
wt_rlmn()=default
random_access_const_iterator< wt_rlmn > const_iterator
Definition: wt_rlmn.hpp:101
wt_rlmn_trait< alphabet_category >::C_type C_type
Definition: wt_rlmn.hpp:116
bool operator==(wt_rlmn const &other) const noexcept
Equality operator.
Definition: wt_rlmn.hpp:453
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1].
Definition: wt_rlmn.hpp:314
t_select select_support_type
Definition: wt_rlmn.hpp:105
size_type size() const
Returns the size of the original vector.
Definition: wt_rlmn.hpp:287
t_wt::value_type value_type
Definition: wt_rlmn.hpp:99
const size_type & sigma
Definition: wt_rlmn.hpp:140
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_rlmn.hpp:402
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Load via cereal.
Definition: wt_rlmn.hpp:434
bool empty() const
Returns whether the wavelet tree contains no data.
Definition: wt_rlmn.hpp:290
wt_rlmn(wt_rlmn &&wt)
Move constructor.
Definition: wt_rlmn.hpp:234
value_type operator[](size_type i) const
Recovers the i-th symbol of the original vector.
Definition: wt_rlmn.hpp:299
wt_rlmn(const wt_rlmn &wt)
Copy constructor.
Definition: wt_rlmn.hpp:215
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many times symbol wt[i] occurs in the prefix [0..i-1].
Definition: wt_rlmn.hpp:339
bool operator!=(wt_rlmn const &other) const noexcept
Inequality operator.
Definition: wt_rlmn.hpp:461
t_wt wt_type
Definition: wt_rlmn.hpp:97
wt_tag index_category
Definition: wt_rlmn.hpp:106
wt_rlmn(t_it begin, t_it end, std::string tmp_dir=ram_file_name(""))
Construct the wavelet tree from a sequence defined by two interators.
Definition: wt_rlmn.hpp:152
int_vector ::size_type size_type
Definition: wt_rlmn.hpp:98
t_rank rank_support_type
Definition: wt_rlmn.hpp:104
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_rlmn.hpp:383
const_iterator begin() const
Returns a const_iterator to the first element.
Definition: wt_rlmn.hpp:377
wt_rlmn & operator=(const wt_rlmn &wt)
Assignment operator.
Definition: wt_rlmn.hpp:253
t_bitvector::difference_type difference_type
Definition: wt_rlmn.hpp:100
const_iterator iterator
Definition: wt_rlmn.hpp:102
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition: wt_rlmn.hpp:418
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition: wt_rlmn.hpp:380
size_type select(size_type i, value_type c) const
Calculates the ith occurrence of the symbol c in the supported vector.
Definition: wt_rlmn.hpp:367
wt_rlmn & operator=(wt_rlmn &&wt)
Assignment move operator.
Definition: wt_rlmn.hpp:264
int_vector.hpp contains the sdsl::int_vector class.
uint64_t id()
uint64_t pid()
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
std::string ram_file_name(const std::string &file)
Returns the corresponding RAM-file name for file.
Definition: ram_fs.hpp:174
size_t write_member(const T &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition: io.hpp:84
void read_member(T &t, std::istream &in)
Definition: io.hpp:111
int_vector< 1 > bit_vector
bit_vector is a specialization of the int_vector.
Definition: int_vector.hpp:51
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
sd_vector.hpp contains the sdsl::sd_vector class, and classes which support rank and select for sd_ve...
Contains declarations and definitions of data structure concepts.
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661
static int_vector< 64 > temp_C()
Definition: wt_rlmn.hpp:63
static C_bf_rank_type init_C_bf_rank(const C_type &, uint64_t)
Definition: wt_rlmn.hpp:67
static C_type init_C(C_type &C, uint64_t)
Definition: wt_rlmn.hpp:65
static C_type init_C(std::map< uint64_t, uint64_t > &C, uint64_t size)
Definition: wt_rlmn.hpp:41
static C_bf_rank_type init_C_bf_rank(const C_type &C, uint64_t size)
Definition: wt_rlmn.hpp:47
int_vector C_bf_rank_type
Definition: wt_rlmn.hpp:37
static std::map< uint64_t, uint64_t > temp_C()
Definition: wt_rlmn.hpp:39
int_vector C_type
Definition: wt_rlmn.hpp:36
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.