SDSL  3.0.0
Succinct Data Structure Library
wt_ap.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
9 #ifndef INCLUDED_SDSL_WT_AP
10 #define INCLUDED_SDSL_WT_AP
11 
12 #include <sdsl/bit_vectors.hpp>
13 #include <sdsl/int_vector.hpp>
14 #include <sdsl/vectors.hpp>
15 #include <sdsl/wm_int.hpp>
16 #include <sdsl/wt_huff.hpp>
17 
19 namespace sdsl
20 {
21 
23 
36 template <class t_wt_byte = wt_huff<bit_vector, rank_support_v5<>>, class t_wt_int = wm_int<>>
37 class wt_ap
38 {
39  static_assert(std::is_same<typename index_tag<t_wt_byte>::type, wt_tag>::value,
40  "First template argument has to be a wavelet tree.");
41  static_assert(std::is_same<typename index_tag<t_wt_int>::type, wt_tag>::value,
42  "Second template argument has to be a wavelet tree.");
43 
44  public:
50  typedef t_wt_byte wt_byte_type;
51  typedef t_wt_int wt_int_type;
54  enum
55  {
56  lex_ordered = 0
57  };
58 
59  protected:
61  value_type m_sigma = 0; //<- \f$ |\Sigma| \f$
66  std::vector<wt_int_type> m_offset;
67 
68  private:
69  // retrieves a character's class and offset - if the character exists in the text
70  inline std::tuple<bool, value_type, value_type> try_get_char_class_offset(value_type c) const
71  {
72  if (c >= m_char2class.size())
73  { // c is greater than any symbol in text
74  return std::make_tuple(false, 0, 0);
75  }
76  auto offset_class = m_char2class.inverse_select(c);
77  if (offset_class.second == m_class_cnt)
78  { // c never occurs in text
79  return std::make_tuple(false, 0, 0);
80  }
81  return std::make_tuple(true, offset_class.second, offset_class.first);
82  }
83 
84  public:
85  const size_type & sigma = m_sigma;
86 
88  wt_ap() {}
89 
91 
96  template <typename t_it>
97  wt_ap(t_it begin, t_it end, std::string tmp_dir = ram_file_name(""))
98  : m_size(std::distance(begin, end))
99  {
100  const uint8_t wt_byte_width = wt_byte_type::alphabet_category::WIDTH;
101  const uint8_t wt_int_width = wt_int_type::alphabet_category::WIDTH;
102 
103  // calculate effective sigma and character frequencies
104  value_type max_symbol = 0;
105  std::vector<std::pair<size_type, value_type>> char_freq;
106  value_type pseudo_entries = 0;
107  {
108  auto event = memory_monitor::event("char freq");
109  for (auto it = begin; it != end; ++it)
110  {
111  value_type element = *it;
112  while (element >= max_symbol)
113  {
114  char_freq.emplace_back(0, max_symbol);
115  max_symbol++;
116  pseudo_entries++;
117  }
118  if (char_freq[element].first == 0) { pseudo_entries--; }
119  char_freq[element].first++;
120  }
121  std::sort(char_freq.rbegin(), char_freq.rend());
122  m_sigma = max_symbol - pseudo_entries;
123  }
124 
125  m_singleton_class_cnt = std::min(max_symbol, (value_type)bits::hi(m_sigma));
127 
128  std::vector<std::pair<std::string, int_vector_buffer<wt_int_width>>> temp_file_offset_buffers;
129 
130  // assign character classes
131  int_vector<wt_byte_width> m_char2class_buffer(max_symbol, m_class_cnt, bits::hi(m_class_cnt + 1) + 1);
132  for (value_type i = 0; i < m_singleton_class_cnt; ++i) { m_char2class_buffer[char_freq[i].second] = i; }
133  value_type current_symbol = m_singleton_class_cnt;
134  value_type class_size = 1;
135  {
136  auto event = memory_monitor::event("char2class");
137  for (value_type i = m_singleton_class_cnt; i < m_class_cnt; ++i)
138  {
139  class_size <<= 1;
140  value_type offset = 0;
141  for (; offset < class_size && current_symbol < m_sigma; ++offset, ++current_symbol)
142  {
143  m_char2class_buffer[char_freq[current_symbol].second] = i;
144  }
145 
146  std::string temp_file_offset = tmp_dir + "_wt_ap_offset_" + util::to_string(i - m_singleton_class_cnt) +
148  temp_file_offset_buffers.emplace_back(temp_file_offset,
149  int_vector_buffer<wt_int_width>(temp_file_offset,
150  std::ios::out,
151  1024 * 1024,
152  bits::hi(offset) + 1));
153  }
154  char_freq.clear();
155  construct_im(m_char2class, m_char2class_buffer);
156  }
157 
158  // calculate text-order classes and offsets
159  std::string temp_file_class = tmp_dir + "_wt_ap_class_" + util::to_string(util::pid()) + "_" +
161  int_vector_buffer<wt_byte_width> class_buffer(temp_file_class,
162  std::ios::out,
163  1024 * 1024,
164  bits::hi(m_class_cnt) + 1);
165  {
166  auto event = memory_monitor::event("write class and offset");
167  for (auto it = begin; it != end; ++it)
168  {
169  value_type ch = *it;
170  value_type cl = m_char2class_buffer[ch];
171  class_buffer.push_back(cl);
172  if (cl >= m_singleton_class_cnt)
173  {
174  value_type offset = m_char2class.rank(ch, cl);
175  cl -= m_singleton_class_cnt;
176  temp_file_offset_buffers[cl].second.push_back(offset);
177  }
178  }
179  class_buffer.close();
180  }
181 
182  {
183  auto event = memory_monitor::event("class WT");
184  int_vector_buffer<wt_byte_width> class_buffer(temp_file_class);
185  m_class = wt_byte_type(class_buffer.begin(), class_buffer.end(), tmp_dir);
186  }
187  sdsl::remove(temp_file_class);
188  {
189  auto event = memory_monitor::event("offset WTs");
191  for (value_type i = 0; i < m_class_cnt - m_singleton_class_cnt; ++i)
192  {
193  auto & temp_file_offset_buffer = temp_file_offset_buffers[i];
194  temp_file_offset_buffer.second.close();
195  {
196  int_vector_buffer<wt_int_width> offset_buffer(temp_file_offset_buffer.first);
197  m_offset[i] = wt_int_type(offset_buffer.begin(), offset_buffer.end(), tmp_dir);
198  }
199  sdsl::remove(temp_file_offset_buffer.first);
200  }
201  }
202  }
203 
205  wt_ap(const wt_ap & wt)
206  : m_size(wt.m_size)
207  , m_sigma(wt.m_sigma)
211  , m_class(wt.m_class)
212  , m_offset(wt.m_offset)
213  {}
214 
216  wt_ap(wt_ap && wt) { *this = std::move(wt); }
217 
219  wt_ap & operator=(const wt_ap & wt)
220  {
221  if (this != &wt)
222  {
223  wt_ap tmp(wt);
224  *this = std::move(tmp);
225  }
226  return *this;
227  }
228 
231  {
232  if (this != &wt)
233  {
234  m_size = wt.m_size;
235  m_sigma = wt.m_sigma;
236  m_singleton_class_cnt = wt.m_singleton_class_cnt;
237  m_class_cnt = wt.m_class_cnt;
238  m_char2class = std::move(wt.m_char2class);
239  m_class = std::move(wt.m_class);
240  m_offset = std::move(wt.m_offset);
241  }
242  return *this;
243  }
244 
246  size_type size() const { return m_size; }
247 
249  bool empty() const { return m_size == 0; }
250 
252 
262  {
263  assert(i < size());
264  auto textoffset_class = m_class.inverse_select(i);
265  auto cl = textoffset_class.second;
266  value_type offset = cl < m_singleton_class_cnt ? 0
267  : m_offset[cl - m_singleton_class_cnt][textoffset_class.first];
268  return m_char2class.select(offset + 1, cl);
269  };
270 
272 
284  {
285  assert(i <= size());
286  auto success_class_offset = try_get_char_class_offset(c);
287  if (!std::get<0>(success_class_offset)) { return 0; }
288  auto cl = std::get<1>(success_class_offset);
289  auto offset = std::get<2>(success_class_offset);
290  size_type count = m_class.rank(i, cl);
291  return cl < m_singleton_class_cnt ? count : m_offset[cl - m_singleton_class_cnt].rank(count, offset);
292  };
293 
295 
301  std::pair<size_type, value_type> inverse_select(size_type i) const
302  {
303  assert(i < size());
304 
305  auto textoffset_class = m_class.inverse_select(i);
306  auto textoffset = textoffset_class.first;
307  auto cl = textoffset_class.second;
308  if (cl < m_singleton_class_cnt) { return std::make_pair(textoffset, m_char2class.select(1, cl)); }
309  auto class_result = m_offset[cl - m_singleton_class_cnt].inverse_select(textoffset);
310  return std::make_pair(class_result.first, m_char2class.select(class_result.second + 1, cl));
311  }
312 
314 
325  {
326  assert(1 <= i and i <= rank(size(), c));
327  auto success_class_offset = try_get_char_class_offset(c);
328  if (!std::get<0>(success_class_offset)) { return m_size; }
329  auto cl = std::get<1>(success_class_offset);
330  auto offset = std::get<2>(success_class_offset);
331  size_type text_offset = cl < m_singleton_class_cnt ? i
332  : 1 + m_offset[cl - m_singleton_class_cnt].select(i, offset);
333  return m_class.select(text_offset, cl);
334  };
335 
337  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
338  {
339  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
340  size_type written_bytes = 0;
341  written_bytes += write_member(m_size, out, child, "size");
342  written_bytes += write_member(m_sigma, out, child, "sigma");
343  written_bytes += write_member(m_singleton_class_cnt, out, child, "singleton_classes");
344  written_bytes += write_member(m_class_cnt, out, child, "classes");
345  written_bytes += m_char2class.serialize(out, child, "char2class");
346  written_bytes += m_class.serialize(out, child, "class");
347  for (value_type i = 0; i < m_offset.size(); ++i)
348  {
349  written_bytes += m_offset[i].serialize(out, child, "offset");
350  }
351  structure_tree::add_size(child, written_bytes);
352  return written_bytes;
353  }
354 
356  void load(std::istream & in)
357  {
358  read_member(m_size, in);
359  read_member(m_sigma, in);
362  m_char2class.load(in);
363  m_class.load(in);
365  m_offset.resize(offset_size);
366  for (value_type i = 0; i < offset_size; ++i) { m_offset[i].load(in); }
367  }
368 
370  template <typename archive_t>
371  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
372  {
373  ar(CEREAL_NVP(m_size));
374  ar(CEREAL_NVP(m_sigma));
376  ar(CEREAL_NVP(m_class_cnt));
378  ar(CEREAL_NVP(m_class));
379  ar(CEREAL_NVP(m_offset));
380  }
381 
383  template <typename archive_t>
384  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
385  {
386  ar(CEREAL_NVP(m_size));
387  ar(CEREAL_NVP(m_sigma));
389  ar(CEREAL_NVP(m_class_cnt));
391  ar(CEREAL_NVP(m_class));
392  ar(CEREAL_NVP(m_offset));
393  }
394 
395  iterator begin() { return { this, 0 }; };
396  const_iterator end() { return { this, size() }; };
397  iterator begin() const { return { this, 0 }; };
398  const_iterator end() const { return { this, size() }; };
399 
401  bool operator==(wt_ap const & other) const noexcept
402  {
403  return (m_size == other.m_size) && (m_sigma == other.m_sigma) &&
404  (m_singleton_class_cnt == other.m_singleton_class_cnt) && (m_class_cnt == other.m_class_cnt) &&
405  (m_char2class == other.m_char2class) && (m_class == other.m_class) && (m_offset == other.m_offset);
406  }
407 
409  bool operator!=(wt_ap const & other) const noexcept { return !(*this == other); }
410 };
411 
412 } // end namespace sdsl
413 #endif
bit_vectors.hpp contains classes for uncompressed and compressed bit vector representations.
#define CEREAL_NVP(X)
Definition: cereal.hpp:30
void close(bool remove_file=false)
Close the int_vector_buffer.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
A generic vector class for integers of width .
Definition: int_vector.hpp:253
static mm_event_proxy event(const std::string &name)
Generic iterator for a random access container.
Definition: iterators.hpp:24
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
A wavelet tree class for integer sequences.
Definition: wt_ap.hpp:38
std::vector< wt_int_type > m_offset
Definition: wt_ap.hpp:66
wt_byte_type m_class
Definition: wt_ap.hpp:65
wt_ap & operator=(const wt_ap &wt)
Assignment operator.
Definition: wt_ap.hpp:219
t_wt_int wt_int_type
Definition: wt_ap.hpp:51
iterator begin()
Definition: wt_ap.hpp:395
const_iterator end() const
Definition: wt_ap.hpp:398
bool operator!=(wt_ap const &other) const noexcept
Inequality operator.
Definition: wt_ap.hpp:409
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition: wt_ap.hpp:371
bool operator==(wt_ap const &other) const noexcept
Equality operator.
Definition: wt_ap.hpp:401
wt_ap(const wt_ap &wt)
Copy constructor.
Definition: wt_ap.hpp:205
wt_tag index_category
Definition: wt_ap.hpp:52
size_type m_size
Definition: wt_ap.hpp:60
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Load via cereal.
Definition: wt_ap.hpp:384
const size_type & sigma
Definition: wt_ap.hpp:85
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many occurrences of symbol wt[i] are in the prefix [0..i-1] of the original sequence.
Definition: wt_ap.hpp:301
wt_ap(wt_ap &&wt)
Move copy constructor.
Definition: wt_ap.hpp:216
const_iterator end()
Definition: wt_ap.hpp:396
wt_ap & operator=(wt_ap &&wt)
Assignment move operator.
Definition: wt_ap.hpp:230
size_type select(size_type i, value_type c) const
Calculates the i-th occurrence of the symbol c in the supported vector.
Definition: wt_ap.hpp:324
@ lex_ordered
Definition: wt_ap.hpp:56
value_type m_singleton_class_cnt
Definition: wt_ap.hpp:62
wt_byte_type m_char2class
Definition: wt_ap.hpp:64
wt_ap(t_it begin, t_it end, std::string tmp_dir=ram_file_name(""))
Construct the wavelet tree from a sequence defined by two interators.
Definition: wt_ap.hpp:97
bool empty() const
Returns whether the wavelet tree contains no data.
Definition: wt_ap.hpp:249
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1] of the supported vector.
Definition: wt_ap.hpp:283
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_ap.hpp:356
int_vector ::difference_type difference_type
Definition: wt_ap.hpp:47
int_vector ::size_type size_type
Definition: wt_ap.hpp:40
iterator begin() const
Definition: wt_ap.hpp:397
value_type operator[](size_type i) const
Recovers the i-th symbol of the original vector.
Definition: wt_ap.hpp:261
int_alphabet_tag alphabet_category
Definition: wt_ap.hpp:53
wt_ap()
Default constructor.
Definition: wt_ap.hpp:88
t_wt_byte wt_byte_type
Definition: wt_ap.hpp:50
const_iterator iterator
Definition: wt_ap.hpp:49
value_type m_class_cnt
Definition: wt_ap.hpp:63
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_ap.hpp:337
int_vector ::value_type value_type
Definition: wt_ap.hpp:46
random_access_const_iterator< wt_ap > const_iterator
Definition: wt_ap.hpp:48
value_type m_sigma
Definition: wt_ap.hpp:61
size_type size() const
Returns the size of the original vector.
Definition: wt_ap.hpp:246
int_vector.hpp contains the sdsl::int_vector class.
uint64_t id()
uint64_t pid()
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
uint64_t count(const t_k2_treap &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
std::string ram_file_name(const std::string &file)
Returns the corresponding RAM-file name for file.
Definition: ram_fs.hpp:174
size_t write_member(const T &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition: io.hpp:84
void read_member(T &t, std::istream &in)
Definition: io.hpp:111
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
void construct_im(t_index &idx, t_data &&data, uint8_t num_bytes=0)
Definition: construct.hpp:58
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661
wm_int.hpp contains a specialized class for a wavelet tree for sequences over large alphabets.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.