SDSL  3.0.0
Succinct Data Structure Library
suffix_tree_algorithm.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
8 #ifndef INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM
9 #define INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM
10 
11 #include <iterator>
12 
14 
15 namespace sdsl
16 {
17 
19 
31 template <class t_cst>
32 typename t_cst::size_type
33 forward_search(const t_cst & cst,
34  typename t_cst::node_type & v,
35  const typename t_cst::size_type d,
36  const typename t_cst::char_type c,
37  typename t_cst::size_type & char_pos,
39  typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
40  cst_tag())
41 {
42  auto cc = cst.csa.char2comp[c]; // check if c occurs in the text of the csa
43  if (cc == 0 and cc != c) // " " " " " " " " " "
44  return 0;
45  typename t_cst::size_type depth_node = cst.depth(v);
46  if (d < depth_node)
47  { // in an edge, no branching
48  char_pos = cst.csa.psi[char_pos];
49  if (char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc + 1]) return 0;
50  return cst.size(v);
51  }
52  else if (d == depth_node)
53  { // at a node, branching
54  v = cst.child(v, c, char_pos);
55  if (v == cst.root())
56  return 0;
57  else
58  return cst.size(v);
59  }
60  else
61  {
62  return 0;
63  }
64 }
65 
67 
78 template <class t_cst, class t_pat_iter>
79 typename t_cst::size_type
80 forward_search(const t_cst & cst,
81  typename t_cst::node_type & v,
82  typename t_cst::size_type d,
83  t_pat_iter begin,
84  t_pat_iter end,
85  typename t_cst::size_type & char_pos,
87  typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
88  cst_tag())
89 {
90  if (begin == end) return cst.size(v);
91  typename t_cst::size_type size = 0;
92  t_pat_iter it = begin;
93  while (it != end and (size = forward_search(cst, v, d, *it, char_pos)))
94  {
95  ++d;
96  ++it;
97  }
98  return size;
99 }
100 
102 
114 template <class t_cst, class t_pat_iter>
115 typename t_cst::size_type count(const t_cst & cst, t_pat_iter begin, t_pat_iter end, cst_tag)
116 {
117  return count(cst.csa, begin, end);
118 }
119 
121 
135 template <class t_cst, class t_pat_iter, class t_rac = int_vector<64>>
136 t_rac locate(const t_cst & cst,
137  t_pat_iter begin,
138  t_pat_iter end,
140  typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
141  cst_tag())
142 {
143  return locate(cst.csa, begin, end);
144 }
145 
147 
157 template <class t_cst, class t_text_iter>
158 typename t_cst::size_type extract(const t_cst & cst,
159  const typename t_cst::node_type & v,
160  t_text_iter text,
162  typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value,
163  cst_tag>::type x = cst_tag())
164 {
165  if (v == cst.root())
166  {
167  text[0] = 0;
168  return 0;
169  }
170  // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
171  typename t_cst::size_type begin = cst.csa[cst.lb(v)];
172  // then call the extract method on the compressed suffix array
173  extract(cst.csa, begin, begin + cst.depth(v) - 1, text);
174 }
175 
177 
185 template <class t_cst>
186 typename t_cst::csa_type::string_type
187 extract(const t_cst & cst,
188  const typename t_cst::node_type & v,
190  typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
191  cst_tag())
192 {
193  typedef typename t_cst::csa_type::string_type t_rac;
194  if (v == cst.root()) { return t_rac(0); }
195  // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
196  typename t_cst::size_type begin = cst.csa[cst.lb(v)];
197  // then call the extract method on the compressed suffix array
198  return extract(cst.csa, begin, begin + cst.depth(v) - 1);
199 }
200 
202 
208 template <class t_cst>
209 double H0(const typename t_cst::node_type & v, const t_cst & cst)
210 {
211  if (cst.is_leaf(v)) { return 0; }
212  else
213  {
214  double h0 = 0;
215  auto n = cst.size(v);
216  for (const auto & child : cst.children(v))
217  {
218  double p = ((double)cst.size(child)) / n;
219  h0 -= p * log2(p);
220  }
221  return h0;
222  }
223 }
224 
226 
231 template <class t_cst>
232 std::pair<double, size_t> Hk(const t_cst & cst, typename t_cst::size_type k)
233 {
234  double hk = 0;
235  size_t context = 0;
236  std::set<typename t_cst::size_type> leafs_with_d_smaller_k;
237  for (typename t_cst::size_type d = 1; d < k; ++d)
238  {
239  leafs_with_d_smaller_k.insert(cst.csa.isa[cst.csa.size() - d]);
240  }
241  for (typename t_cst::const_iterator it = cst.begin(), end = cst.end(); it != end; ++it)
242  {
243  if (it.visit() == 1)
244  {
245  if (!cst.is_leaf(*it))
246  {
247  typename t_cst::size_type d = cst.depth(*it);
248  if (d >= k)
249  {
250  if (d == k) { hk += cst.size(*it) * H0(*it, cst); }
251  ++context;
252  it.skip_subtree();
253  }
254  }
255  else
256  {
257  // if d of leaf is >= k, add context
258  if (leafs_with_d_smaller_k.find(cst.lb(*it)) == leafs_with_d_smaller_k.end()) { ++context; }
259  }
260  }
261  }
262  hk /= cst.size();
263  return { hk, context };
264 }
265 
266 } // namespace sdsl
267 #endif
size_type size() const noexcept
The number of elements in the int_vector.
#define SDSL_UNUSED
Definition: config.hpp:13
int_vector ::size_type size_type
Namespace for the succinct data structure library.
uint64_t count(const t_k2_treap &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
std::pair< double, size_t > Hk(const t_cst &cst, typename t_cst::size_type k)
Calculate the k-th order entropy of a text.
t_csa::size_type forward_search(const t_csa &csa, typename t_csa::size_type l, typename t_csa::size_type r, t_pat_iter begin, t_pat_iter end, typename t_csa::size_type &l_res, typename t_csa::size_type &r_res, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Forward search for a pattern in an -interval in the CSA.
t_rac locate(const t_csa &csa, t_pat_iter begin, t_pat_iter end, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Calculates all occurrences of a pattern pat in a CSA.
t_csa::size_type extract(const t_csa &csa, typename t_csa::size_type begin, typename t_csa::size_type end, t_text_iter text, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Writes the substring T[begin..end] of the original text T to text[0..end-begin+1].
double H0(const typename t_cst::node_type &v, const t_cst &cst)
Calculate the zeroth order entropy of the text that follows a certain substring s.
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
suffix_array_algorithm.hpp contains algorithms on CSAs