SDSL  3.0.0
Succinct Data Structure Library
csa_bitcompressed.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
8 #ifndef INCLUDED_SDSL_CSA_UNCOMPRESSED
9 #define INCLUDED_SDSL_CSA_UNCOMPRESSED
10 
11 #include <algorithm>
12 #include <iomanip>
13 #include <iostream>
14 #include <iterator>
15 #include <string>
16 
19 #include <sdsl/int_vector.hpp>
20 #include <sdsl/iterators.hpp>
21 #include <sdsl/sdsl_concepts.hpp>
23 #include <sdsl/util.hpp>
24 
25 namespace sdsl
26 {
27 
29 
44 template <class t_alphabet_strat = byte_alphabet>
46 {
47  friend class bwt_of_csa_psi<csa_bitcompressed>;
48 
49  public:
50  typedef uint64_t value_type; // STL Container requirement
52  typedef const_iterator iterator; // STL Container requirement
53  typedef const value_type const_reference;
56  typedef const pointer const_pointer;
57  typedef int_vector<>::size_type size_type; // STL Container requirement
59  typedef ptrdiff_t difference_type; // STL Container requirement
68  typedef t_alphabet_strat alphabet_type;
69  typedef typename alphabet_type::char_type char_type; // Note: This is the char type of the CSA not the WT!
70  typedef typename alphabet_type::comp_char_type comp_char_type;
71  typedef typename alphabet_type::string_type string_type;
72  typedef typename alphabet_type::alphabet_category alphabet_category;
74 
77 
78  enum
79  {
81  isa_sample_dens = 1
82  };
83 
84  private:
85  sa_sample_type m_sa; // vector for suffix array values
86  isa_sample_type m_isa; // vector for inverse suffix array values
87  alphabet_type m_alphabet;
88 
89  public:
90  const typename alphabet_type::char2comp_type & char2comp = m_alphabet.char2comp;
91  const typename alphabet_type::comp2char_type & comp2char = m_alphabet.comp2char;
92  const typename alphabet_type::C_type & C = m_alphabet.C;
93  const typename alphabet_type::sigma_type & sigma = m_alphabet.sigma;
94  const psi_type psi = psi_type(*this);
95  const lf_type lf = lf_type(*this);
96  const bwt_type bwt = bwt_type(*this);
97  const bwt_type L = bwt_type(*this);
98  const isa_type & isa = m_isa;
100  const text_type text = text_type(*this);
101  const sa_sample_type & sa_sample = m_sa;
102  const isa_sample_type & isa_sample = m_isa;
103 
108  : m_sa(csa.m_sa)
109  , m_isa(csa.m_isa)
110  , m_alphabet(csa.m_alphabet)
111  {}
112 
114  csa_bitcompressed(csa_bitcompressed && csa) { *this = std::move(csa); }
115 
118  {
119  std::string text_file = cache_file_name(key_text<alphabet_type::int_width>(), config);
122  size_type n = text_buf.size();
123 
124  m_alphabet = alphabet_type(text_buf, n);
125  m_sa = sa_sample_type(config);
126  m_isa = isa_sample_type(config);
127  }
128 
130 
133  size_type size() const { return m_sa.size(); }
134 
136 
140 
142 
145  bool empty() const { return m_sa.empty(); }
146 
148 
151  const_iterator begin() const { return const_iterator(this, 0); }
152 
154 
157  const_iterator end() const { return const_iterator(this, size()); }
158 
160 
164  inline value_type operator[](size_type i) const { return m_sa[i]; }
165 
167 
171  {
172  if (this != &csa)
173  {
174  csa_bitcompressed tmp(csa);
175  *this = std::move(tmp);
176  }
177  return *this;
178  }
179 
181 
185  {
186  if (this != &csa)
187  {
188  m_sa = std::move(csa.m_sa);
189  m_isa = std::move(csa.m_isa);
190  m_alphabet = std::move(csa.m_alphabet);
191  }
192  return *this;
193  }
194 
196  bool operator==(csa_bitcompressed const & other) const noexcept
197  {
198  return (m_sa == other.m_sa) && (m_isa == other.m_isa) && (m_alphabet == other.m_alphabet);
199  }
200 
202  bool operator!=(csa_bitcompressed const & other) const noexcept { return !(*this == other); }
203 
205 
208  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
209  {
210  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
211  size_type written_bytes = 0;
212  written_bytes += m_sa.serialize(out, child, "m_sa");
213  written_bytes += m_isa.serialize(out, child, "m_isa");
214  written_bytes += m_alphabet.serialize(out, child, "m_alphabet");
215  structure_tree::add_size(child, written_bytes);
216  return written_bytes;
217  }
218 
219  void load(std::istream & in)
220  {
221  m_sa.load(in);
222  m_isa.load(in);
223  m_alphabet.load(in);
224  }
225 
226  template <typename archive_t>
227  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
228  {
229  ar(CEREAL_NVP(m_sa));
230  ar(CEREAL_NVP(m_isa));
231  ar(CEREAL_NVP(m_alphabet));
232  }
233 
234  template <typename archive_t>
235  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
236  {
237  ar(CEREAL_NVP(m_sa));
238  ar(CEREAL_NVP(m_isa));
239  ar(CEREAL_NVP(m_alphabet));
240  }
241 
242  size_type get_sample_dens() const { return 1; }
243 
244  private:
245  // Calculates how many symbols c are in the prefix [0..i-1] of the BWT of the original text.
246  /*
247  * \param i The exclusive index of the prefix range [0..i-1], so \f$i\in [0..size()]\f$.
248  * \param c The symbol to count the occurrences in the prefix.
249  * \returns The number of occurrences of symbol c in the prefix [0..i-1] of the BWT.
250  * \par Time complexity
251  * \f$ \Order{\log n} \f$
252  */
253  size_type rank_bwt(size_type i, const char_type c) const
254  {
255  // TODO: special case if c == BWT[i-1] we can use LF to get a constant time answer
256  comp_char_type cc = char2comp[c];
257  if (cc == 0 and c != 0) // character is not in the text => return 0
258  return 0;
259  // binary search the interval [C[cc]..C[cc+1]-1] for the result
260  size_type lower_b = C[cc],
261  upper_b = C[((size_type)1) + cc]; // lower_b inclusive, upper_b exclusive
262  while (lower_b + 1 < upper_b)
263  {
264  size_type mid = (lower_b + upper_b) / 2;
265  if (psi[mid] >= i)
266  upper_b = mid;
267  else
268  lower_b = mid;
269  }
270  if (lower_b > C[cc])
271  return lower_b - C[cc] + 1;
272  else
273  { // lower_b == m_C[cc]
274  return psi[lower_b] < i; // 1 if psi[lower_b]<i, 0 otherwise
275  }
276  }
277 
278  // Calculates the i-th occurrence of symbol c in the BWT of the original text.
279  /*
280  * \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$.
281  * \param c Character c.
282  * \returns The i-th occurrence of c in the BWT or size() if c does
283  * not occur t times in BWT>
284  * \par Time complexity
285  * \f$ \Order{t_{\Psi}} \f$
286  */
287  size_type select_bwt(size_type i, const char_type c) const
288  {
289  comp_char_type cc = char2comp[c];
290  if (cc == 0 and c != 0) // character is not in the text => return size()
291  return size();
292  if (C[cc] + i - 1 < C[((size_type)1) + cc]) { return psi[C[cc] + i - 1]; }
293  return size();
294  }
295 };
296 
297 } // end namespace sdsl
298 #endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:30
A wrapper for the bwt of a compressed suffix array that is based on the function.
A class for the uncompressed suffix array (SA).
bwt_of_csa_psi< csa_bitcompressed > bwt_type
csa_bitcompressed & operator=(csa_bitcompressed &&csa)
Assignment Move Operator.
csa_bitcompressed()
Default constructor.
const alphabet_type::char2comp_type & char2comp
bool operator==(csa_bitcompressed const &other) const noexcept
Equality operator.
traverse_csa_saisa< csa_bitcompressed, false > lf_type
const alphabet_type::C_type & C
const alphabet_type::comp2char_type & comp2char
_isa_sampling< csa_bitcompressed, 0 > isa_sample_type
const sa_sample_type & sa_sample
void load(std::istream &in)
_sa_order_sampling< csa_bitcompressed, 0 > sa_sample_type
bool empty() const
Returns if the data structure is empty.
size_type get_sample_dens() const
value_type operator[](size_type i) const
[]-operator
csa_bitcompressed(const csa_bitcompressed &csa)
Copy constructor.
csa_bitcompressed & operator=(const csa_bitcompressed &csa)
Assignment Operator.
const alphabet_type::sigma_type & sigma
bool operator!=(csa_bitcompressed const &other) const noexcept
Inequality operator.
size_type size() const
Number of elements in the instance.
csa_bitcompressed csa_type
alphabet_type::string_type string_type
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
csa_bitcompressed(csa_bitcompressed &&csa)
Move constructor.
csa_bitcompressed(cache_config &config)
Constructor.
random_access_const_iterator< csa_bitcompressed > const_iterator
traverse_csa_saisa< csa_bitcompressed, true > psi_type
const first_row_type F
text_of_csa< csa_bitcompressed > text_type
const isa_sample_type & isa_sample
alphabet_type::char_type char_type
const_iterator begin() const
Returns a const_iterator to the first element.
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
const_iterator end() const
Returns a const_iterator to the element after the last element.
const value_type const_reference
static size_type max_size()
Returns the largest size that csa_bitcompressed can ever have.
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
alphabet_type::comp_char_type comp_char_type
t_alphabet_strat alphabet_type
first_row_of_csa< csa_bitcompressed > first_row_type
int_vector ::size_type size_type
alphabet_type::alphabet_category alphabet_category
uint64_t size() const
Returns the number of elements currently stored.
A generic vector class for integers of width .
Definition: int_vector.hpp:253
static size_type max_size() noexcept
Maximum size of the int_vector.
Definition: int_vector.hpp:566
Generic iterator for a random access container.
Definition: iterators.hpp:24
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
A helper class for the function for (compressed) suffix arrays which provide also the inverse suffix...
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
csa_sampling_strategy.hpp includes different strategy classes for suffix array sampling in the CSAs.
int_vector.hpp contains the sdsl::int_vector class.
iterators.hpp contains an generic iterator for random access containers.
constexpr char KEY_SA[]
Definition: config.hpp:37
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
Contains declarations and definitions of data structure concepts.
Helper class for construction process.
Definition: config.hpp:67
suffix_array_helper.hpp contains some helper classes for CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.