SDSL  3.0.0
Succinct Data Structure Library
csa_sampling_strategy.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
9 #ifndef INCLUDED_CSA_SAMPLING_STRATEGY
10 #define INCLUDED_CSA_SAMPLING_STRATEGY
11 
12 /*
13  * Text = ABCDEFABCDEF$
14  * 0123456789012
15  * sa_sample_dens = 2
16  * *1 SA *2
17  * * 12 * $
18  * 06 * ABCDEF$
19  * * 00 * ABCDEFABCDEF$
20  * 07 BCDEF$
21  * * 01 BCDEFABCDEF$
22  * 08 * CDEF$
23  * * 02 * CDEFABCDEF$
24  * 09 DEF$
25  * * 03 DEFABCDEF$
26  * 10 * EF$
27  * * 04 * EFABCDEF$
28  * 11 F$
29  * * 05 FABCDEF$
30  *
31  * The first sampling (*1) is called suffix order sampling. It has the advantage, that
32  * we don't need to store a bitvector, which marks the sampled suffixes, since a suffix
33  * at index \‍(i\‍) in the suffix array is marked if \‍( 0 \equiv i \mod sa_sample_dens \‍).
34  *
35  * The second sampling (*2) is called text order sampling. It is also called regular in [1].
36  *
37  * [1] P.Ferragina, J. Siren, R. Venturini: Distribution-Aware Compressed Full-Text Indexes, ESA 2011
38  */
39 
40 #include <set>
41 #include <tuple>
42 
44 #include <sdsl/int_vector.hpp>
46 #include <sdsl/wavelet_trees.hpp>
47 
48 namespace sdsl
49 {
50 
51 template <class t_csa, uint8_t t_width = 0>
52 class _sa_order_sampling : public int_vector<t_width>
53 {
54  public:
56  typedef typename base_type::size_type size_type; // make typedefs of base_type visible
57  typedef typename base_type::value_type value_type; //
58  enum
59  {
60  sample_dens = t_csa::sa_sample_dens
61  };
62  enum
63  {
64  text_order = false
65  };
67 
70 
72  /*
73  * \param cconfig Cache configuration (SA is expected to be cached.).
74  * \param csa Pointer to the corresponding CSA. Not used in this class.
75  * \par Time complexity
76  * Linear in the size of the suffix array.
77  */
78  _sa_order_sampling(const cache_config & cconfig, SDSL_UNUSED const t_csa * csa = nullptr)
79  {
81  size_type n = sa_buf.size();
82  this->width(bits::hi(n) + 1);
83  this->resize((n + sample_dens - 1) / sample_dens);
84 
85  for (size_type i = 0, cnt_mod = sample_dens, cnt_sum = 0; i < n; ++i, ++cnt_mod)
86  {
87  size_type sa = sa_buf[i];
88  if (sample_dens == cnt_mod)
89  {
90  cnt_mod = 0;
91  base_type::operator[](cnt_sum++) = sa;
92  }
93  }
94  }
95 
97  inline bool is_sampled(size_type i) const { return 0 == (i % sample_dens); }
98 
101 };
102 
103 template <uint8_t t_width = 0>
105 {
106  template <class t_csa>
109 };
110 
111 template <class t_csa, class t_bv = bit_vector, class t_rank = typename t_bv::rank_1_type, uint8_t t_width = 0>
112 class _text_order_sampling : public int_vector<t_width>
113 {
114  private:
115  t_bv m_marked;
116  t_rank m_rank_marked;
117 
118  public:
120  typedef typename base_type::size_type size_type; // make typedefs of base_type visible
121  typedef typename base_type::value_type value_type; //
122  typedef t_bv bv_type;
123  enum
124  {
125  sample_dens = t_csa::sa_sample_dens
126  };
127  enum
128  {
129  text_order = true
130  };
132 
133  const bv_type & marked = m_marked;
134  const t_rank & rank_marked = m_rank_marked;
135 
138 
140  /*
141  * \param cconfig Cache configuration (SA is expected to be cached.).
142  * \param csa Pointer to the corresponding CSA. Not used in this class.
143  * \par Time complexity
144  * Linear in the size of the suffix array.
145  */
146  _text_order_sampling(const cache_config & cconfig, SDSL_UNUSED const t_csa * csa = nullptr)
147  {
149  size_type n = sa_buf.size();
150  bit_vector marked(n, 0); // temporary bitvector for the marked text positions
151  this->width(bits::hi(n / sample_dens) + 1);
152  this->resize((n + sample_dens - 1) / sample_dens);
153 
154  for (size_type i = 0, sa_cnt = 0; i < n; ++i)
155  {
156  size_type sa = sa_buf[i];
157  if (0 == (sa % sample_dens))
158  {
159  marked[i] = 1;
160  base_type::operator[](sa_cnt++) = sa / sample_dens;
161  }
162  }
163  m_marked = std::move(t_bv(marked));
164  util::init_support(m_rank_marked, &m_marked);
165  }
166 
169  : base_type(st)
170  {
171  m_marked = st.m_marked;
172  m_rank_marked = st.m_rank_marked;
173  m_rank_marked.set_vector(&m_marked);
174  }
175 
177  inline bool is_sampled(size_type i) const { return m_marked[i]; }
178 
180  inline value_type operator[](size_type i) const { return base_type::operator[](m_rank_marked(i)) * sample_dens; }
181 
183 
186  {
187  if (this != &st)
188  {
190  m_marked = st.m_marked;
191  m_rank_marked = st.m_rank_marked;
192  m_rank_marked.set_vector(&m_marked);
193  }
194  return *this;
195  }
196 
199  {
200  base_type::swap(st);
201  m_marked.swap(st.m_marked);
202  util::swap_support(m_rank_marked, st.m_rank_marked, &m_marked, &(st.m_marked));
203  }
204 
205  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
206  {
207  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
208  size_type written_bytes = 0;
209  written_bytes += base_type::serialize(out, child, "samples");
210  written_bytes += m_marked.serialize(out, child, "marked");
211  written_bytes += m_rank_marked.serialize(out, child, "rank_marked");
212  structure_tree::add_size(child, written_bytes);
213  return written_bytes;
214  }
215 
216  void load(std::istream & in)
217  {
218  base_type::load(in);
219  m_marked.load(in);
220  m_rank_marked.load(in);
221  m_rank_marked.set_vector(&m_marked);
222  }
223 
224  template <typename archive_t>
225  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
226  {
228  ar(CEREAL_NVP(m_marked));
229  ar(CEREAL_NVP(m_rank_marked));
230  }
231 
232  template <typename archive_t>
233  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
234  {
236  ar(CEREAL_NVP(m_marked));
237  ar(CEREAL_NVP(m_rank_marked));
238  m_rank_marked.set_vector(&m_marked);
239  }
240 };
241 
242 template <class t_bit_vec = sd_vector<>, class t_rank_sup = typename t_bit_vec::rank_1_type, uint8_t t_width = 0>
244 {
245  template <class t_csa>
248 };
249 
250 template <class t_csa,
251  class t_bv_sa = sd_vector<>,
252  class t_bv_isa = sd_vector<>,
253  class t_rank_sa = typename t_bv_sa::rank_1_type,
254  class t_select_isa = typename t_bv_isa::select_1_type>
256 {
257  private:
258  t_bv_sa m_marked_sa;
259  t_rank_sa m_rank_marked_sa;
260  t_bv_isa m_marked_isa;
261  t_select_isa m_select_marked_isa;
262  wt_int<rrr_vector<63>> m_inv_perm;
263 
264  public:
265  typedef typename bit_vector::size_type size_type; // make typedefs of base_type visible
266  typedef typename bit_vector::value_type value_type; //
267  typedef t_bv_sa bv_sa_type;
268  enum
269  {
270  sample_dens = t_csa::sa_sample_dens
271  };
272  enum
273  {
274  text_order = true
275  };
277 
278  const t_bv_sa & marked_sa = m_marked_sa;
279  const t_rank_sa & rank_marked_sa = m_rank_marked_sa;
280  const t_bv_isa & marked_isa = m_marked_isa;
281  const t_select_isa & select_marked_isa = m_select_marked_isa;
282 
285 
287  /*
288  * \param cconfig Cache configuration (SA is expected to be cached.).
289  * \param csa Pointer to the corresponding CSA. Not used in this class.
290  * \par Time complexity
291  * Linear in the size of the suffix array.
292  */
293  _fuzzy_sa_sampling(cache_config & cconfig, SDSL_UNUSED const t_csa * csa = nullptr)
294  {
295  {
296  // (2) check, if the suffix array is cached
297  if (!cache_file_exists(conf::KEY_ISA, cconfig))
298  {
299  auto event = memory_monitor::event("ISA");
300  construct_isa(cconfig);
301  }
303  }
304  {
306  size_type n = isa_buf.size();
307  bit_vector marked_isa(n, 0); // temporary bitvector for marked ISA positions
308  bit_vector marked_sa(n, 0); // temporary bitvector for marked SA positions
309  int_vector<> inv_perm((n + sample_dens - 1) / sample_dens, 0, bits::hi(n) + 1);
310  size_type cnt = 0;
311  size_type runs = 1;
312 
313  uint64_t min_prev_val = 0;
314  for (size_type i = 0; i < n; i += sample_dens)
315  {
316  size_type pos_min = i;
317  size_type pos_cnd = isa_buf[i] >= min_prev_val ? i : n;
318  for (size_type j = i + 1; j < i + sample_dens and j < n; ++j)
319  {
320  if (isa_buf[j] < isa_buf[pos_min]) pos_min = j;
321  if (isa_buf[j] >= min_prev_val)
322  {
323  if (pos_cnd == n) { pos_cnd = j; }
324  else if (isa_buf[j] < isa_buf[pos_cnd])
325  {
326  pos_cnd = j;
327  }
328  }
329  }
330  if (pos_cnd == n)
331  { // increasing sequence can not be extended
332  pos_cnd = pos_min;
333  ++runs;
334  }
335  min_prev_val = isa_buf[pos_cnd];
336  marked_isa[pos_cnd] = 1;
337  inv_perm[cnt++] = min_prev_val;
338  marked_sa[min_prev_val] = 1;
339  }
340  m_marked_isa = std::move(t_bv_isa(marked_isa));
341  util::init_support(m_select_marked_isa, &m_marked_isa);
342  {
344  for (size_type i = 0; i < inv_perm.size(); ++i) { inv_perm[i] = rank_marked_sa(inv_perm[i]); }
345  }
346  util::bit_compress(inv_perm);
347 
348  m_marked_sa = std::move(t_bv_sa(marked_sa));
349  util::init_support(m_rank_marked_sa, &m_marked_sa);
350 
351  std::string tmp_key = "fuzzy_isa_samples_" + util::to_string(util::pid()) + "_" +
353  std::string tmp_file_name = cache_file_name(tmp_key, cconfig);
354  store_to_file(inv_perm, tmp_file_name);
355  construct(m_inv_perm, tmp_file_name, 0);
356  sdsl::remove(tmp_file_name);
357  }
358  }
359 
362  : m_marked_sa(st.m_marked_sa)
363  , m_rank_marked_sa(st.m_rank_marked_sa)
364  , m_marked_isa(st.m_marked_isa)
365  , m_select_marked_isa(st.m_select_marked_isa)
366  , m_inv_perm(st.m_inv_perm)
367  {
368  m_rank_marked_sa.set_vector(&m_marked_sa);
369  m_select_marked_isa.set_vector(&m_marked_isa);
370  }
371 
374  : m_marked_sa(std::move(st.m_marked_sa))
375  , m_rank_marked_sa(std::move(st.m_rank_marked_sa))
376  , m_marked_isa(std::move(st.m_marked_isa))
377  , m_select_marked_isa(std::move(st.m_select_marked_isa))
378  , m_inv_perm(std::move(st.m_inv_perm))
379  {
380  m_rank_marked_sa.set_vector(&m_marked_sa);
381  m_select_marked_isa.set_vector(&m_marked_isa);
382  }
383 
385  inline bool is_sampled(size_type i) const { return m_marked_sa[i]; }
386 
389  {
390  return m_select_marked_isa(m_inv_perm.select(1, m_rank_marked_sa(i)) + 1);
391  }
392 
394  inline value_type inv(size_type i) const { return m_inv_perm[i]; }
395 
396  size_type size() const { return m_inv_perm.size(); }
397 
400  {
401  if (this != &st)
402  {
403  _fuzzy_sa_sampling tmp(st);
404  *this = std::move(tmp);
405  }
406  return *this;
407  }
408 
411  {
412  m_marked_sa = std::move(st.m_marked_sa);
413  m_rank_marked_sa = std::move(st.m_rank_marked_sa);
414  m_marked_isa = std::move(st.m_marked_isa);
415  m_select_marked_isa = std::move(st.m_select_marked_isa);
416  m_inv_perm = std::move(st.m_inv_perm);
417  m_rank_marked_sa.set_vector(&m_marked_sa);
418  m_select_marked_isa.set_vector(&m_marked_isa);
419  return *this;
420  }
421 
422  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
423  {
424  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
425  size_type written_bytes = 0;
426  written_bytes += m_marked_sa.serialize(out, child, "marked_sa");
427  written_bytes += m_rank_marked_sa.serialize(out, child, "rank_marked_sa");
428  written_bytes += m_marked_isa.serialize(out, child, "marked_isa");
429  written_bytes += m_select_marked_isa.serialize(out, child, "select_marked_isa");
430  written_bytes += m_inv_perm.serialize(out, child, "inv_perm");
431  structure_tree::add_size(child, written_bytes);
432  return written_bytes;
433  }
434 
435  void load(std::istream & in)
436  {
437  m_marked_sa.load(in);
438  m_rank_marked_sa.load(in);
439  m_rank_marked_sa.set_vector(&m_marked_sa);
440  m_marked_isa.load(in);
441  m_select_marked_isa.load(in);
442  m_select_marked_isa.set_vector(&m_marked_isa);
443  m_inv_perm.load(in);
444  }
445 
446  template <typename archive_t>
447  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
448  {
449  ar(CEREAL_NVP(m_marked_sa));
450  ar(CEREAL_NVP(m_rank_marked_sa));
451  ar(CEREAL_NVP(m_marked_isa));
452  ar(CEREAL_NVP(m_select_marked_isa));
453  ar(CEREAL_NVP(m_inv_perm));
454  }
455 
456  template <typename archive_t>
457  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
458  {
459  ar(CEREAL_NVP(m_marked_sa));
460  ar(CEREAL_NVP(m_rank_marked_sa));
461  m_rank_marked_sa.set_vector(&m_marked_sa);
462  ar(CEREAL_NVP(m_marked_isa));
463  ar(CEREAL_NVP(m_select_marked_isa));
464  m_select_marked_isa.set_vector(&m_marked_isa);
465  ar(CEREAL_NVP(m_inv_perm));
466  }
467 
469  bool operator==(_fuzzy_sa_sampling const & other) const noexcept
470  {
471  return (m_marked_sa == other.m_marked_sa) && (m_rank_marked_sa == other.m_rank_marked_sa) &&
472  (m_marked_isa == other.m_marked_isa) && (m_select_marked_isa == other.m_select_marked_isa) &&
473  (m_inv_perm == other.m_inv_perm);
474  }
475 
477  bool operator!=(_fuzzy_sa_sampling const & other) const noexcept { return !(*this == other); }
478 };
479 template <class t_bv_sa = sd_vector<>,
480  class t_bv_isa = sd_vector<>,
481  class t_rank_sa = typename t_bv_sa::rank_1_type,
482  class t_select_isa = typename t_bv_isa::select_1_type>
484 {
485  template <class t_csa>
488 };
489 
490 /*
491  * Text = ABCDEFABCDEF$
492  * 0123456789012
493  * sa_sample_dens = 4
494  * sa_sample_chars = {B,E}
495  * SA BWT (1)
496  * 12 F * $
497  * 06 F ABCDEF$
498  * 00 $ * ABCDEFABCDEF$
499  * 07 A BCDEF$
500  * 01 A BCDEFABCDEF$
501  * 08 B * CDEF$
502  * 02 B * CDEFABCDEF$
503  * 09 C DEF$
504  * 03 C DEFABCDEF$
505  * 10 D EF$
506  * 04 D * EFABCDEF$
507  * 11 E * F$
508  * 05 E * FABCDEF$
509  *
510  * In this sampling a suffix x=SA[i] is marked if x \‍( 0 \equiv x \mod sa_sample_dens \‍) or
511  * BWT[i] is contained in sa_sample_chars.
512  */
513 
514 template <class t_csa, class t_bv = bit_vector, class t_rank = typename t_bv::rank_1_type, uint8_t t_width = 0>
515 class _bwt_sampling : public int_vector<t_width>
516 {
517  private:
518  t_bv m_marked;
519  t_rank m_rank_marked;
520 
521  public:
523  typedef typename base_type::size_type size_type; // make typedefs of base_type visible
524  typedef typename base_type::value_type value_type; //
525  enum
526  {
527  sample_dens = t_csa::sa_sample_dens
528  };
529  enum
530  {
531  text_order = false
532  };
534 
537 
539  /*
540  * \param cconfig Cache configuration (BWT,SA, and SAMPLE_CHARS are expected to be cached.).
541  * \param csa Pointer to the corresponding CSA. Not used in this class.
542  * \par Time complexity
543  * Linear in the size of the suffix array.
544  */
545  _bwt_sampling(const cache_config & cconfig, SDSL_UNUSED const t_csa * csa = nullptr)
546  {
549  key_bwt<t_csa::alphabet_type::int_width>(), cconfig));
550  size_type n = sa_buf.size();
551  bit_vector marked(n, 0); // temporary bitvector for the marked text positions
552  this->width(bits::hi(n) + 1);
553  int_vector<> sample_char;
554  typedef typename t_csa::char_type char_type;
555  std::set<char_type> char_map;
556  if (load_from_cache(sample_char, conf::KEY_SAMPLE_CHAR, cconfig))
557  {
558  for (uint64_t i = 0; i < sample_char.size(); ++i) { char_map.insert((char_type)sample_char[i]); }
559  }
560  size_type sa_cnt = 0;
561  for (size_type i = 0; i < n; ++i)
562  {
563  size_type sa = sa_buf[i];
564  char_type bwt = bwt_buf[i];
565  if (0 == (sa % sample_dens))
566  {
567  marked[i] = 1;
568  ++sa_cnt;
569  }
570  else if (char_map.find(bwt) != char_map.end())
571  {
572  marked[i] = 1;
573  ++sa_cnt;
574  }
575  }
576  this->resize(sa_cnt);
577  sa_cnt = 0;
578  for (size_type i = 0; i < n; ++i)
579  {
580  size_type sa = sa_buf[i];
581  if (marked[i]) { base_type::operator[](sa_cnt++) = sa; }
582  }
583  m_marked = std::move(marked);
584  util::init_support(m_rank_marked, &m_marked);
585  }
586 
589  : base_type(st)
590  {
591  m_marked = st.m_marked;
592  m_rank_marked = st.m_rank_marked;
593  m_rank_marked.set_vector(&m_marked);
594  }
595 
597  inline bool is_sampled(size_type i) const { return m_marked[i]; }
598 
600  inline value_type operator[](size_type i) const { return base_type::operator[](m_rank_marked(i)) * sample_dens; }
601 
604  {
605  if (this != &st)
606  {
608  m_marked = st.m_marked;
609  m_rank_marked = st.m_rank_marked;
610  m_rank_marked.set_vector(&m_marked);
611  }
612  return *this;
613  }
614 
616  void swap(_bwt_sampling & st)
617  {
618  base_type::swap(st);
619  m_marked.swap(st.m_marked);
620  util::swap_support(m_rank_marked, st.m_rank_marked, &m_marked, &(st.m_marked));
621  }
622 
623  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
624  {
625  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
626  size_type written_bytes = 0;
627  written_bytes += base_type::serialize(out, child, "samples");
628  written_bytes += m_marked.serialize(out, child, "marked");
629  written_bytes += m_rank_marked.serialize(out, child, "rank_marked");
630  structure_tree::add_size(child, written_bytes);
631  return written_bytes;
632  }
633 
634  void load(std::istream & in)
635  {
636  base_type::load(in);
637  m_marked.load(in);
638  m_rank_marked.load(in);
639  m_rank_marked.set_vector(&m_marked);
640  }
641 
642  template <typename archive_t>
643  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
644  {
646  ar(CEREAL_NVP(m_marked));
647  ar(CEREAL_NVP(m_rank_marked));
648  }
649 
650  template <typename archive_t>
651  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
652  {
654  ar(CEREAL_NVP(m_marked));
655  ar(CEREAL_NVP(m_rank_marked));
656  m_rank_marked.set_vector(&m_marked);
657  }
658 };
659 
660 template <class t_bit_vec = bit_vector, class t_rank_sup = typename t_bit_vec::rank_1_type, uint8_t t_width = 0>
662 {
663  template <class t_csa>
666 };
667 
668 template <class t_csa, uint8_t t_width = 0>
669 class _isa_sampling : public int_vector<t_width>
670 {
671  public:
673  typedef typename base_type::size_type size_type; // make typedefs of base_type visible
674  typedef typename base_type::value_type value_type; //
675  typedef typename t_csa::sa_sample_type sa_type; // sa sample type
676  enum
677  {
678  sample_dens = t_csa::isa_sample_dens
679  };
681 
684 
686  /*
687  * \param cconfig Cache configuration (SA is expected to be cached.).
688  * \param sa_sample Pointer to the corresponding SA sampling. Not used in this class.
689  * \par Time complexity
690  * Linear in the size of the suffix array.
691  */
692  _isa_sampling(const cache_config & cconfig, SDSL_UNUSED const sa_type * sa_sample = nullptr)
693  {
695  size_type n = sa_buf.size();
696  if (n >= 1)
697  { // so n+t_csa::isa_sample_dens >= 2
698  this->width(bits::hi(n) + 1);
699  this->resize((n - 1) / sample_dens + 1);
700  }
701  for (size_type i = 0; i < this->size(); ++i) base_type::operator[](i) = 0;
702 
703  for (size_type i = 0; i < n; ++i)
704  {
705  size_type sa = sa_buf[i];
706  if ((sa % sample_dens) == 0) { base_type::operator[](sa / sample_dens) = i; }
707  }
708  }
709 
712 
714  inline std::tuple<value_type, size_type> sample_leq(size_type i) const
715  {
716  size_type ci = i / sample_dens;
717  return std::make_tuple(base_type::operator[](ci), ci * sample_dens);
718  }
719 
721  inline std::tuple<value_type, size_type> sample_qeq(size_type i) const
722  {
723  size_type ci = (i / sample_dens + 1) % this->size();
724  return std::make_tuple(base_type::operator[](ci), ci * sample_dens);
725  }
726 
728  void load(std::istream & in, SDSL_UNUSED const sa_type * sa_sample = nullptr) { base_type::load(in); }
729 
730  template <typename archive_t>
731  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
732  {
734  }
735 
736  template <typename archive_t>
737  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
738  {
740  }
741 
742  void set_vector(SDSL_UNUSED const sa_type *) {}
743 };
744 
745 template <uint8_t t_width = 0>
747 {
748  template <class t_csa>
751 };
752 
753 template <class t_csa, class t_inv_perm, class t_sel>
755 {
756  static_assert(t_csa::sa_sample_dens == t_csa::isa_sample_dens,
757  "ISA sampling requires: sa_sample_dens == isa_sample_dens");
758 
759  public:
762  typedef typename t_csa::sa_sample_type sa_type; // sa sample type
763  typedef typename sa_type::bv_type bv_type; // bitvector type used to mark SA samples
764  enum
765  {
766  sample_dens = t_csa::isa_sample_dens
767  };
769 
770  private:
771  t_sel m_select_marked;
772  t_inv_perm m_inv_perm;
773 
774  public:
775  const t_sel & select_marked = m_select_marked;
776 
779 
781  /*
782  * \param cconfig Cache configuration. (Not used in this class)
783  * \param sa_sample Pointer to the corresponding SA sampling..
784  * \par Time complexity
785  * Linear in the size of the suffix array.
786  */
788  const typename std::enable_if<sa_type::text_order, sa_type *>::type sa_sample)
789  {
790  // and initialize the select support on bitvector marked
791  m_select_marked = t_sel(&(sa_sample->marked));
792  const int_vector<> * perm = (const int_vector<> *)sa_sample;
793  m_inv_perm = t_inv_perm(perm);
794  m_inv_perm.set_vector(perm);
795  }
796 
799  {
800  m_inv_perm = st.m_inv_perm;
801  m_select_marked = st.m_select_marked;
802  }
803 
805  inline value_type operator[](size_type i) const { return m_select_marked(m_inv_perm[i / sample_dens] + 1); }
806 
808  inline std::tuple<value_type, size_type> sample_leq(size_type i) const
809  {
810  size_type ci = i / sample_dens;
811  return std::make_tuple(m_select_marked(m_inv_perm[ci] + 1), ci * sample_dens);
812  }
813 
815  inline std::tuple<value_type, size_type> sample_qeq(size_type i) const
816  {
817  size_type ci = (i / sample_dens + 1) % m_inv_perm.size();
818  return std::make_tuple(m_select_marked(m_inv_perm[ci] + 1), ci * sample_dens);
819  }
820 
823  {
824  if (this != &st)
825  {
826  m_inv_perm = st.m_inv_perm;
827  m_select_marked = st.m_select_marked;
828  }
829  return *this;
830  }
831 
834  {
835  if (this != &st)
836  {
837  m_inv_perm.swap(st.m_inv_perm);
838  m_select_marked.swap(st.m_select_marked);
839  }
840  }
841 
842  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
843  {
844  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
845  size_type written_bytes = 0;
846  written_bytes += m_inv_perm.serialize(out, child, "inv_perm");
847  written_bytes += m_select_marked.serialize(out, child, "select_marked");
848  structure_tree::add_size(child, written_bytes);
849  return written_bytes;
850  }
851 
853  void load(std::istream & in, const sa_type * sa_sample = nullptr)
854  {
855  m_inv_perm.load(in);
856  m_select_marked.load(in);
857  set_vector(sa_sample);
858  }
859 
860  template <typename archive_t>
861  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
862  {
863  ar(CEREAL_NVP(m_inv_perm));
864  ar(CEREAL_NVP(m_select_marked));
865  }
866 
867  template <typename archive_t>
868  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar, const sa_type * sa_sample = nullptr)
869  {
870  ar(CEREAL_NVP(m_inv_perm));
871  ar(CEREAL_NVP(m_select_marked));
872  set_vector(sa_sample);
873  }
874 
876  bool operator==(_text_order_isa_sampling_support const & other) const noexcept
877  {
878  return (m_inv_perm == other.m_inv_perm) && (m_select_marked == other.m_select_marked);
879  }
880 
882  bool operator!=(_text_order_isa_sampling_support const & other) const noexcept { return !(*this == other); }
883 
884  void set_vector(const sa_type * sa_sample = nullptr)
885  {
886  if (sa_sample == nullptr)
887  {
888  m_select_marked.set_vector(nullptr);
889  m_inv_perm.set_vector(nullptr);
890  }
891  else
892  {
893  m_select_marked.set_vector(&(sa_sample->marked));
894  m_inv_perm.set_vector((const int_vector<> *)sa_sample);
895  }
896  }
897 };
898 
899 template <class t_inv_perm = inv_perm_support<8>, class t_sel = void>
901 {
902  template <class t_csa>
904  t_inv_perm,
905  typename std::conditional<std::is_void<t_sel>::value,
906  typename t_csa::sa_sample_type::bv_type::select_1_type,
907  t_sel>::type>;
909 };
910 
911 template <class t_csa, class t_select_sa>
913 {
914  static_assert(t_csa::sa_sample_dens == t_csa::isa_sample_dens,
915  "ISA sampling requires: sa_sample_dens==isa_sample_dens");
916 
917  public:
920  typedef typename t_csa::sa_sample_type sa_type; // sa sample type
921  enum
922  {
923  sample_dens = t_csa::isa_sample_dens
924  };
926 
927  private:
928  const sa_type * m_sa_p = nullptr; // pointer to sa_sample_strategy
929  t_select_sa m_select_marked_sa;
930 
931  public:
934 
936  /*
937  * \param cconfig Cache configuration. (Not used in this class)
938  * \param sa_sample Pointer to the corresponding SA sampling..
939  * \par Time complexity
940  * Linear in the size of the suffix array.
941  */
942  _fuzzy_isa_sampling_support(SDSL_UNUSED const cache_config & cconfig, const sa_type * sa_sample)
943  : m_sa_p(sa_sample)
944  {
945  util::init_support(m_select_marked_sa, &(sa_sample->marked_sa));
946  }
947 
950  : m_select_marked_sa(st.m_select_marked_sa)
951  {
952  set_vector(st.m_sa_p);
953  }
954 
956  inline value_type operator[](size_type i) const { return m_sa_p->inv(i); }
957 
959  inline std::tuple<value_type, size_type> sample_leq(size_type i) const
960  {
961  size_type ci = i / sample_dens;
962  size_type j = m_sa_p->select_marked_isa(ci + 1);
963  if (j > i)
964  {
965  if (ci > 0) { ci = ci - 1; }
966  else
967  {
968  ci = m_sa_p->size() - 1;
969  }
970  j = m_sa_p->select_marked_isa(ci + 1);
971  }
972  return std::make_tuple(m_select_marked_sa(m_sa_p->inv(ci) + 1), j);
973  }
974 
976  inline std::tuple<value_type, size_type> sample_qeq(size_type i) const
977  {
978  size_type ci = i / sample_dens;
979  size_type j = m_sa_p->select_marked_isa(ci + 1);
980  if (j < i)
981  {
982  if (ci < m_sa_p->size() - 1) { ci = ci + 1; }
983  else
984  {
985  ci = 0;
986  }
987  j = m_sa_p->select_marked_isa(ci + 1);
988  }
989  return std::make_tuple(m_select_marked_sa(m_sa_p->inv(ci) + 1), j);
990  }
991 
994  {
995  if (this != &st)
996  {
997  m_select_marked_sa = st.m_select_marked_sa;
998  set_vector(st.m_sa_p);
999  }
1000  return *this;
1001  }
1002 
1004  void swap(_fuzzy_isa_sampling_support & st) { m_select_marked_sa.swap(st.m_select_marked_sa); }
1005 
1006  size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
1007  {
1008  structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
1009  size_type written_bytes = 0;
1010  written_bytes += m_select_marked_sa.serialize(out, v, "select_marked_sa");
1011  structure_tree::add_size(child, written_bytes);
1012  return written_bytes;
1013  }
1014 
1016  void load(std::istream & in, const sa_type * sa_sample = nullptr)
1017  {
1018  m_select_marked_sa.load(in);
1019  set_vector(sa_sample);
1020  }
1021 
1022  template <typename archive_t>
1023  void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
1024  {
1025  ar(CEREAL_NVP(m_select_marked_sa));
1026  }
1027 
1028  template <typename archive_t>
1029  void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar, const sa_type * sa_sample = nullptr)
1030  {
1031  ar(CEREAL_NVP(m_select_marked_sa));
1032  set_vector(sa_sample);
1033  }
1034 
1036  bool operator==(_fuzzy_isa_sampling_support const & other) const noexcept
1037  {
1038  return (m_select_marked_sa == other.m_select_marked_sa);
1039  }
1040 
1042  bool operator!=(_fuzzy_isa_sampling_support const & other) const noexcept { return !(*this == other); }
1043 
1044  void set_vector(const sa_type * sa_sample = nullptr)
1045  {
1046  m_sa_p = sa_sample;
1047  if (nullptr != m_sa_p) { m_select_marked_sa.set_vector(&(sa_sample->marked_sa)); }
1048  }
1049 };
1050 
1051 template <class t_select_sa = void>
1053 {
1054  template <class t_csa>
1056  typename std::conditional<std::is_void<t_select_sa>::value,
1057  typename t_csa::sa_sample_type::bv_sa_type::select_1_type,
1058  t_select_sa>::type>;
1060 };
1061 
1062 } // namespace sdsl
1063 
1064 #endif
#define CEREAL_NVP(X)
Definition: cereal.hpp:30
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
_bwt_sampling()
Default constructor.
base_type::value_type value_type
void load(std::istream &in)
_bwt_sampling(const _bwt_sampling &st)
Copy constructor.
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
_bwt_sampling(const cache_config &cconfig, SDSL_UNUSED const t_csa *csa=nullptr)
Constructor.
int_vector< t_width > base_type
_bwt_sampling & operator=(const _bwt_sampling &st)
Assignment operation.
base_type::size_type size_type
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
value_type operator[](size_type i) const
Return the suffix array value for the sampled index i.
bool is_sampled(size_type i) const
Determine if index i is sampled or not.
void swap(_bwt_sampling &st)
Swap operation.
sa_sampling_tag sampling_category
bool operator!=(_fuzzy_isa_sampling_support const &other) const noexcept
Inequality operator.
_fuzzy_isa_sampling_support(SDSL_UNUSED const cache_config &cconfig, const sa_type *sa_sample)
Constructor.
void load(std::istream &in, const sa_type *sa_sample=nullptr)
Load sampling from disk.
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar, const sa_type *sa_sample=nullptr)
_fuzzy_isa_sampling_support(const _fuzzy_isa_sampling_support &st)
Copy constructor.
void swap(_fuzzy_isa_sampling_support &st)
Swap operation.
std::tuple< value_type, size_type > sample_qeq(size_type i) const
Returns the leftmost ISA sample >= i and its position.
value_type operator[](size_type i) const
Return the inverse suffix array value for the sampled index i.
bool operator==(_fuzzy_isa_sampling_support const &other) const noexcept
Equality operator.
_fuzzy_isa_sampling_support & operator=(const _fuzzy_isa_sampling_support &st)
Assignment operation.
std::tuple< value_type, size_type > sample_leq(size_type i) const
Returns the rightmost ISA sample <= i and its position.
_fuzzy_isa_sampling_support()
Default constructor.
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
void set_vector(const sa_type *sa_sample=nullptr)
_fuzzy_sa_sampling(const _fuzzy_sa_sampling &st)
Copy constructor.
bool operator==(_fuzzy_sa_sampling const &other) const noexcept
Equality operator.
_fuzzy_sa_sampling(_fuzzy_sa_sampling &&st)
Move constructor.
const t_select_isa & select_marked_isa
value_type inv(size_type i) const
Return the inv permutation at position i (already condensed!!!)
value_type operator[](size_type i) const
Return the suffix array value for the sampled index i.
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
_fuzzy_sa_sampling & operator=(const _fuzzy_sa_sampling &st)
Assignment operation.
bit_vector::value_type value_type
bool is_sampled(size_type i) const
Determine if index i is sampled or not.
_fuzzy_sa_sampling & operator=(_fuzzy_sa_sampling &&st)
Move assignment operation.
_fuzzy_sa_sampling()
Default constructor.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
_fuzzy_sa_sampling(cache_config &cconfig, SDSL_UNUSED const t_csa *csa=nullptr)
Constructor.
bit_vector::size_type size_type
bool operator!=(_fuzzy_sa_sampling const &other) const noexcept
Inequality operator.
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
_isa_sampling()
Default constructor.
value_type operator[](size_type i) const
Returns the ISA value at position j, where.
isa_sampling_tag sampling_category
base_type::value_type value_type
_isa_sampling(const cache_config &cconfig, SDSL_UNUSED const sa_type *sa_sample=nullptr)
Constructor.
int_vector< t_width > base_type
std::tuple< value_type, size_type > sample_leq(size_type i) const
Returns the rightmost ISA sample <= i and its position.
void set_vector(SDSL_UNUSED const sa_type *)
base_type::size_type size_type
std::tuple< value_type, size_type > sample_qeq(size_type i) const
Returns the leftmost ISA sample >= i and its position.
t_csa::sa_sample_type sa_type
void load(std::istream &in, SDSL_UNUSED const sa_type *sa_sample=nullptr)
Load sampling from disk.
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
int_vector< t_width > base_type
value_type operator[](size_type i) const
Return the suffix array value for the sampled index i.
base_type::size_type size_type
bool is_sampled(size_type i) const
Determine if index i is sampled or not.
_sa_order_sampling(const cache_config &cconfig, SDSL_UNUSED const t_csa *csa=nullptr)
Constructor.
_sa_order_sampling()
Default constructor.
base_type::value_type value_type
std::tuple< value_type, size_type > sample_qeq(size_type i) const
Returns the leftmost ISA sample >= i and its position.
_text_order_isa_sampling_support(const _text_order_isa_sampling_support &st)
Copy constructor.
value_type operator[](size_type i) const
Return the inverse suffix array value for the sampled index i.
bool operator==(_text_order_isa_sampling_support const &other) const noexcept
Equality operator.
_text_order_isa_sampling_support & operator=(const _text_order_isa_sampling_support &st)
Assignment operation.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
std::tuple< value_type, size_type > sample_leq(size_type i) const
Returns the rightmost ISA sample <= i and its position.
void load(std::istream &in, const sa_type *sa_sample=nullptr)
Load sampling from disk.
bool operator!=(_text_order_isa_sampling_support const &other) const noexcept
Inequality operator.
_text_order_isa_sampling_support(SDSL_UNUSED const cache_config &cconfig, const typename std::enable_if< sa_type::text_order, sa_type * >::type sa_sample)
Constructor.
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar, const sa_type *sa_sample=nullptr)
void swap(_text_order_isa_sampling_support &st)
Swap operation.
void set_vector(const sa_type *sa_sample=nullptr)
value_type condensed_sa(size_type i) const
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
_text_order_sampling(const cache_config &cconfig, SDSL_UNUSED const t_csa *csa=nullptr)
Constructor.
_text_order_sampling(const _text_order_sampling &st)
Copy constructor.
value_type operator[](size_type i) const
Return the suffix array value for the sampled index i.
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
bool is_sampled(size_type i) const
Determine if index i is sampled or not.
void swap(_text_order_sampling &st)
Swap operation.
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
_text_order_sampling & operator=(const _text_order_sampling &st)
Assignment operation.
_text_order_sampling()
Default constructor.
uint64_t size() const
Returns the number of elements currently stored.
A generic vector class for integers of width .
Definition: int_vector.hpp:253
reference operator[](const size_type &i) noexcept
non const version of [] operator
std::enable_if<!cereal::traits::is_output_serializable< cereal::BinaryData< int_vector< t_width > >, archive_t >::value, void >::type CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal if archive is not binary.
void swap(int_vector &v) noexcept
Swap method for int_vector.
Definition: int_vector.hpp:527
int_vector_size_type size_type
Definition: int_vector.hpp:266
int_vector & operator=(const int_vector &v)
Assignment operator.
void load(std::istream &in)
Load the int_vector for a stream.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:619
size_type size() const noexcept
The number of elements in the int_vector.
iterator insert(const_iterator it, value_type value)
Insert an element before the element that the iterator is pointing to.
Definition: int_vector.hpp:403
std::enable_if<!cereal::traits::is_input_serializable< cereal::BinaryData< int_vector< t_width > >, archive_t >::value, void >::type CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Serialise (load) via cereal if archive is not binary.
int_vector_trait< t_width >::value_type value_type
Definition: int_vector.hpp:255
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the int_vector to a stream.
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:545
static mm_event_proxy event(const std::string &name)
A rank structure proposed by Sebastiano Vigna.
A bit vector which compresses very sparse populated bit vectors by.
Definition: sd_vector.hpp:115
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
A wavelet tree class for integer sequences.
Definition: wt_int.hpp:49
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition: wt_int.hpp:755
size_type size() const
Returns the size of the original vector.
Definition: wt_int.hpp:304
size_type select(size_type i, value_type c) const
Calculates the i-th occurrence of the symbol c in the supported vector.
Definition: wt_int.hpp:431
void load(std::istream &in)
Loads the data structure from the given istream.
Definition: wt_int.hpp:771
#define SDSL_UNUSED
Definition: config.hpp:13
csa_alphabet_strategy.hpp includes different strategy classes for representing an alphabet of a CSA.
int_vector.hpp contains the sdsl::int_vector class.
inv_perm_support.hpp contains a class which adds access to the inverse of a permutation.
constexpr char KEY_SAMPLE_CHAR[]
Definition: config.hpp:45
constexpr char KEY_SA[]
Definition: config.hpp:37
constexpr char KEY_ISA[]
Definition: config.hpp:40
uint64_t id()
uint64_t pid()
void bit_compress(t_int_vec &v)
Bit compress the int_vector.
Definition: util.hpp:487
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
bool load_from_cache(T &v, const std::string &key, const cache_config &config, bool add_type_hash=false)
Definition: io.hpp:711
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition: construct.hpp:45
void construct_isa(cache_config &config)
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
Definition: io.hpp:798
bool cache_file_exists(const std::string &key, const cache_config &config)
Checks if the resource specified by the key exists in the cache.
Definition: io.hpp:672
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661
Helper class for construction process.
Definition: config.hpp:67
wavelet_trees.hpp contains wavelet tree implementations.