SDSL  3.0.0
Succinct Data Structure Library
csa_wt.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
8 #ifndef INCLUDED_SDSL_CSA_WT
9 #define INCLUDED_SDSL_CSA_WT
10 
11 #include <algorithm> // for std::swap
12 #include <cassert>
13 #include <cstring> // for strlen
14 #include <iomanip>
15 #include <iostream>
16 #include <iterator>
17 
20 #include <sdsl/fast_cache.hpp>
21 #include <sdsl/iterators.hpp>
23 #include <sdsl/util.hpp>
24 #include <sdsl/wavelet_trees.hpp>
25 
26 namespace sdsl
27 {
28 
29 template <class t_csa>
30 class psi_of_csa_wt; // forward declaration of PSI-array class
31 
32 template <class t_csa>
33 class bwt_of_csa_wt; // forward declaration of BWT-array class
34 
37 
48 template <class t_wt = wt_huff<>, // Wavelet tree type
49  uint32_t t_dens = 32, // Sample density for suffix array (SA) values
50  uint32_t t_inv_dens = 64, // Sample density for inverse suffix array (ISA) values
51  class t_sa_sample_strat = sa_order_sa_sampling<>, // Policy class for the SA sampling.
52  class t_isa_sample_strat = isa_sampling<>, // Policy class for ISA sampling.
53  class t_alphabet_strat = // Policy class for the representation of the alphabet.
54  typename wt_alphabet_trait<t_wt>::type>
55 class csa_wt
56 {
57  static_assert(std::is_same<typename index_tag<t_wt>::type, wt_tag>::value,
58  "First template argument has to be a wavelet tree type.");
59  static_assert(t_dens > 0, "Second template argument has to be greater then 0.");
60  static_assert(t_inv_dens > 0, "Third template argument has to be greater then 0.");
61  static_assert(std::is_same<typename sampling_tag<t_sa_sample_strat>::type, sa_sampling_tag>::value,
62  "Forth template argument has to be a suffix array sampling strategy.");
63  static_assert(std::is_same<typename sampling_tag<t_isa_sample_strat>::type, isa_sampling_tag>::value,
64  "Fifth template argument has to be a inverse suffix array sampling strategy.");
65  static_assert(is_alphabet<t_alphabet_strat>::value, "Sixth template argument has to be a alphabet strategy.");
66 
67  friend class bwt_of_csa_wt<csa_wt>;
68 
69  public:
70  enum
71  {
72  sa_sample_dens = t_dens,
73  isa_sample_dens = t_inv_dens
74  };
75 
76  typedef uint64_t value_type;
79  typedef const value_type const_reference;
82  typedef const pointer const_pointer;
85  typedef ptrdiff_t difference_type;
92  typedef t_wt wavelet_tree_type;
93  typedef typename t_sa_sample_strat::template type<csa_wt> sa_sample_type;
94  typedef typename t_isa_sample_strat::template type<csa_wt> isa_sample_type;
95  typedef t_alphabet_strat alphabet_type;
96  typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT!
97  typedef typename alphabet_type::comp_char_type comp_char_type;
98  typedef typename alphabet_type::string_type string_type;
99  typedef csa_wt csa_type;
100 
103  typedef typename alphabet_type::alphabet_category alphabet_category;
104 
105  private:
106  t_wt m_wavelet_tree; // the wavelet tree
107  sa_sample_type m_sa_sample; // suffix array samples
108  isa_sample_type m_isa_sample; // inverse suffix array samples
109  alphabet_type m_alphabet;
110 //#define USE_CSA_CACHE
111 #ifdef USE_CSA_CACHE
112  mutable fast_cache csa_cache;
113 #endif
114 
115  public:
116  const typename alphabet_type::char2comp_type & char2comp = m_alphabet.char2comp;
117  const typename alphabet_type::comp2char_type & comp2char = m_alphabet.comp2char;
118  const typename alphabet_type::C_type & C = m_alphabet.C;
119  const typename alphabet_type::sigma_type & sigma = m_alphabet.sigma;
120  const psi_type psi = psi_type(*this);
121  const lf_type lf = lf_type(*this);
122  const bwt_type bwt = bwt_type(*this);
123  const text_type text = text_type(*this);
125  const bwt_type L = bwt_type(*this);
126  const isa_type isa = isa_type(*this);
127  const sa_sample_type & sa_sample = m_sa_sample;
128  const isa_sample_type & isa_sample = m_isa_sample;
129  const wavelet_tree_type & wavelet_tree = m_wavelet_tree;
130 
132  csa_wt() = default;
133 
135  csa_wt(const csa_wt & csa)
136  : m_wavelet_tree(csa.m_wavelet_tree)
137  , m_sa_sample(csa.m_sa_sample)
138  , m_isa_sample(csa.m_isa_sample)
139  , m_alphabet(csa.m_alphabet)
140  {
141  m_isa_sample.set_vector(&m_sa_sample);
142  }
143 
145  csa_wt(csa_wt && csa)
146  : m_wavelet_tree(std::move(csa.m_wavelet_tree))
147  , m_sa_sample(std::move(csa.m_sa_sample))
148  , m_isa_sample(std::move(csa.m_isa_sample))
149  , m_alphabet(std::move(csa.m_alphabet))
150  {
151  m_isa_sample.set_vector(&m_sa_sample);
152  }
153 
155  csa_wt(cache_config & config);
156 
158 
163  size_type size() const { return m_wavelet_tree.size(); }
164 
166 
169  static size_type max_size() { return bit_vector::max_size(); }
170 
172 
175  bool empty() const { return m_wavelet_tree.empty(); }
176 
178 
181  const_iterator begin() const { return const_iterator(this, 0); }
182 
184 
187  const_iterator end() const { return const_iterator(this, size()); }
188 
190 
196  inline value_type operator[](size_type i) const;
197 
199 
202  csa_wt & operator=(const csa_wt & csa)
203  {
204  if (this != &csa)
205  {
206  csa_wt tmp(csa);
207  *this = std::move(tmp);
208  }
209  return *this;
210  }
211 
213 
217  {
218  if (this != &csa)
219  {
220  m_wavelet_tree = std::move(csa.m_wavelet_tree);
221  m_sa_sample = std::move(csa.m_sa_sample);
222  m_isa_sample = std::move(csa.m_isa_sample);
223  m_isa_sample.set_vector(&m_sa_sample);
224  m_alphabet = std::move(csa.m_alphabet);
225  }
226  return *this;
227  }
228 
230  bool operator==(csa_wt const & other) const noexcept
231  {
232  return (m_wavelet_tree == other.m_wavelet_tree) && (m_sa_sample == other.m_sa_sample) &&
233  (m_isa_sample == other.m_isa_sample) && (m_alphabet == other.m_alphabet);
234  }
235 
237  bool operator!=(csa_wt const & other) const noexcept { return !(*this == other); }
238 
240 
243  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const;
244 
246 
248  void load(std::istream & in);
249 
251  template <typename archive_t>
252  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const;
253 
255  template <typename archive_t>
256  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar);
257 
258  private:
259  // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text.
260  /*
261  * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$.
262  * \param c The symbol to count the occurrences in the prefix.
263  * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT.
264  * \par Time complexity
265  * \f$ \Order{\log |\Sigma|} \f$
266  */
267  size_type rank_bwt(size_type i, const char_type c) const { return m_wavelet_tree.rank(i, c); }
268 
269  // Calculates the position of the i-th c in the BWT of the original text.
270  /*
271  * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$.
272  * \param c Symbol c.
273  * \returns The position of the i-th c in the BWT or size() if c does occur less then i times.
274  * \par Time complexity
275  * \f$ \Order{t_{\Psi}} \f$
276  */
277  size_type select_bwt(size_type i, const char_type c) const
278  {
279  assert(i > 0);
280  char_type cc = char2comp[c];
281  if (cc == 0 and c != 0) // character is not in the text => return size()
282  return size();
283  assert(cc != 255);
284  if (C[cc] + i - 1 < C[cc + 1]) { return m_wavelet_tree.select(i, c); }
285  else
286  return size();
287  }
288 };
289 
290 // == template functions ==
291 
292 template <class t_wt,
293  uint32_t t_dens,
294  uint32_t t_inv_dens,
295  class t_sa_sample_strat,
296  class t_isa,
297  class t_alphabet_strat>
299 {
300  if (!cache_file_exists(key_bwt<alphabet_type::int_width>(), config)) { return; }
301  {
302  auto event = memory_monitor::event("construct csa-alpbabet");
304  cache_file_name(key_bwt<alphabet_type::int_width>(), config));
305  size_type n = bwt_buf.size();
306  m_alphabet = alphabet_type(bwt_buf, n);
307  }
308  {
309  auto event = memory_monitor::event("sample SA");
310  m_sa_sample = sa_sample_type(config);
311  }
312  {
313  auto event = memory_monitor::event("sample ISA");
314  isa_sample_type isa_s(config, &m_sa_sample);
315  util::swap_support(m_isa_sample, isa_s, &m_sa_sample, &m_sa_sample);
316  }
317 
318  // if ( config.delete_files ) {
319  // remove_from_cache<int_vector<>>(conf::KEY_SA, config);
320  // }
321  {
322  auto event = memory_monitor::event("construct wavelet tree");
324  cache_file_name(key_bwt<alphabet_type::int_width>(), config));
325  m_wavelet_tree = wavelet_tree_type(bwt_buf.begin(), bwt_buf.end(), config.dir);
326  }
327 }
328 
329 template <class t_wt,
330  uint32_t t_dens,
331  uint32_t t_inv_dens,
332  class t_sa_sample_strat,
333  class t_isa,
334  class t_alphabet_strat>
336  -> value_type
337 {
338  size_type off = 0;
339  while (!m_sa_sample.is_sampled(i))
340  {
341  i = lf[i];
342  ++off;
343  }
344  value_type result = m_sa_sample[i];
345  if (result + off < size()) { return result + off; }
346  else
347  {
348  return result + off - size();
349  }
350 }
351 
352 template <class t_wt,
353  uint32_t t_dens,
354  uint32_t t_inv_dens,
355  class t_sa_sample_strat,
356  class t_isa,
357  class t_alphabet_strat>
360  std::string name) const
361  -> size_type
362 {
363  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
364  size_type written_bytes = 0;
365  written_bytes += m_wavelet_tree.serialize(out, child, "wavelet_tree");
366  written_bytes += m_sa_sample.serialize(out, child, "sa_samples");
367  written_bytes += m_isa_sample.serialize(out, child, "isa_samples");
368  written_bytes += m_alphabet.serialize(out, child, "alphabet");
369  structure_tree::add_size(child, written_bytes);
370  return written_bytes;
371 }
372 
373 template <class t_wt,
374  uint32_t t_dens,
375  uint32_t t_inv_dens,
376  class t_sa_sample_strat,
377  class t_isa,
378  class t_alphabet_strat>
380 {
381  m_wavelet_tree.load(in);
382  m_sa_sample.load(in);
383  m_isa_sample.load(in, &m_sa_sample);
384  m_alphabet.load(in);
385 }
386 
387 template <class t_wt,
388  uint32_t t_dens,
389  uint32_t t_inv_dens,
390  class t_sa_sample_strat,
391  class t_isa,
392  class t_alphabet_strat>
393 template <typename archive_t>
395  archive_t & ar) const
396 {
397  ar(CEREAL_NVP(m_wavelet_tree));
398  ar(CEREAL_NVP(m_sa_sample));
399  ar(CEREAL_NVP(m_isa_sample));
400  ar(CEREAL_NVP(m_alphabet));
401 }
402 
403 template <class t_wt,
404  uint32_t t_dens,
405  uint32_t t_inv_dens,
406  class t_sa_sample_strat,
407  class t_isa,
408  class t_alphabet_strat>
409 template <typename archive_t>
411  archive_t & ar)
412 {
413  ar(CEREAL_NVP(m_wavelet_tree));
414  ar(CEREAL_NVP(m_sa_sample));
415  ar(CEREAL_NVP(m_isa_sample));
416  m_isa_sample.set_vector(&m_sa_sample);
417  ar(CEREAL_NVP(m_alphabet));
418 }
419 
420 } // end namespace sdsl
421 #endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:30
A class for the Compressed Suffix Array (CSA) based on a Wavelet Tree (WT) of the Burrow Wheeler Tran...
Definition: csa_wt.hpp:56
bool empty() const
Returns if the data strucutre is empty.
Definition: csa_wt.hpp:175
random_access_const_iterator< csa_wt > const_iterator
Definition: csa_wt.hpp:77
t_isa_sample_strat::template type< csa_wt > isa_sample_type
Definition: csa_wt.hpp:94
csa_wt(csa_wt &&csa)
Move constructor.
Definition: csa_wt.hpp:145
bool operator!=(csa_wt const &other) const noexcept
Inequality operator.
Definition: csa_wt.hpp:237
ptrdiff_t difference_type
Definition: csa_wt.hpp:85
const_iterator begin() const
Returns a const_iterator to the first element.
Definition: csa_wt.hpp:181
void load(std::istream &in)
Load from a stream.
Definition: csa_wt.hpp:379
const psi_type psi
Definition: csa_wt.hpp:120
const first_row_type F
Definition: csa_wt.hpp:124
csa_tag index_category
Definition: csa_wt.hpp:101
const bwt_type bwt
Definition: csa_wt.hpp:122
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition: csa_wt.hpp:394
const alphabet_type::char2comp_type & char2comp
Definition: csa_wt.hpp:116
const value_type const_reference
Definition: csa_wt.hpp:79
traverse_csa_wt< csa_wt, true > psi_type
Definition: csa_wt.hpp:86
static size_type max_size()
Returns the largest size that csa_wt can ever have.
Definition: csa_wt.hpp:169
const text_type text
Definition: csa_wt.hpp:123
const alphabet_type::sigma_type & sigma
Definition: csa_wt.hpp:119
alphabet_type::string_type string_type
Definition: csa_wt.hpp:98
const_iterator iterator
Definition: csa_wt.hpp:78
bool operator==(csa_wt const &other) const noexcept
Equality operator.
Definition: csa_wt.hpp:230
@ isa_sample_dens
Definition: csa_wt.hpp:73
@ sa_sample_dens
Definition: csa_wt.hpp:72
text_of_csa< csa_wt > text_type
Definition: csa_wt.hpp:91
const sa_sample_type & sa_sample
Definition: csa_wt.hpp:127
t_sa_sample_strat::template type< csa_wt > sa_sample_type
Definition: csa_wt.hpp:93
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
Definition: csa_wt.hpp:358
const_reference * pointer
Definition: csa_wt.hpp:81
csa_wt & operator=(csa_wt &&csa)
Assignment Move Operator.
Definition: csa_wt.hpp:216
size_type csa_size_type
Definition: csa_wt.hpp:84
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Serialise (load) via cereal.
Definition: csa_wt.hpp:410
value_type operator[](size_type i) const
[]-operator
Definition: csa_wt.hpp:335
csa_wt & operator=(const csa_wt &csa)
Assignment Operator.
Definition: csa_wt.hpp:202
const lf_type lf
Definition: csa_wt.hpp:121
bwt_of_csa_wt< csa_wt > bwt_type
Definition: csa_wt.hpp:88
const_reference reference
Definition: csa_wt.hpp:80
const alphabet_type::C_type & C
Definition: csa_wt.hpp:118
alphabet_type::char_type char_type
Definition: csa_wt.hpp:96
size_type size() const
Number of elements in the .
Definition: csa_wt.hpp:163
const wavelet_tree_type & wavelet_tree
Definition: csa_wt.hpp:129
isa_of_csa_wt< csa_wt > isa_type
Definition: csa_wt.hpp:89
csa_wt()=default
Default constructor.
const isa_type isa
Definition: csa_wt.hpp:126
const isa_sample_type & isa_sample
Definition: csa_wt.hpp:128
alphabet_type::comp_char_type comp_char_type
Definition: csa_wt.hpp:97
alphabet_type::alphabet_category alphabet_category
Definition: csa_wt.hpp:103
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition: csa_wt.hpp:187
csa_wt(const csa_wt &csa)
Copy constructor.
Definition: csa_wt.hpp:135
lf_tag extract_category
Definition: csa_wt.hpp:102
int_vector ::size_type size_type
Definition: csa_wt.hpp:83
first_row_of_csa< csa_wt > first_row_type
Definition: csa_wt.hpp:90
t_alphabet_strat alphabet_type
Definition: csa_wt.hpp:95
traverse_csa_wt< csa_wt, false > lf_type
Definition: csa_wt.hpp:87
csa_wt csa_type
Definition: csa_wt.hpp:99
t_wt wavelet_tree_type
Definition: csa_wt.hpp:92
const bwt_type L
Definition: csa_wt.hpp:125
const pointer const_pointer
Definition: csa_wt.hpp:82
uint64_t value_type
Definition: csa_wt.hpp:76
const alphabet_type::comp2char_type & comp2char
Definition: csa_wt.hpp:117
uint64_t size() const
Returns the number of elements currently stored.
A generic vector class for integers of width .
Definition: int_vector.hpp:253
static size_type max_size() noexcept
Maximum size of the int_vector.
Definition: int_vector.hpp:566
static mm_event_proxy event(const std::string &name)
Generic iterator for a random access container.
Definition: iterators.hpp:24
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
A wrapper class for the and LF function for (compressed) suffix arrays that are based on a wavelet t...
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
csa_sampling_strategy.hpp includes different strategy classes for suffix array sampling in the CSAs.
iterators.hpp contains an generic iterator for random access containers.
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
bool cache_file_exists(const std::string &key, const cache_config &config)
Checks if the resource specified by the key exists in the cache.
Definition: io.hpp:672
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
Helper class for construction process.
Definition: config.hpp:67
std::string dir
Definition: config.hpp:71
suffix_array_helper.hpp contains some helper classes for CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wavelet_trees.hpp contains wavelet tree implementations.