8 #ifndef INCLUDED_SDSL_CST_SCT3
9 #define INCLUDED_SDSL_CST_SCT3
34 #include <type_traits>
77 template <
class t_csa = csa_wt<>,
78 class t_lcp = lcp_dac<>,
79 class t_bp_support = bp_support_sada<>,
80 class t_bv = bit_vector,
81 class t_rank =
typename std::conditional<std::is_same<t_bv, bit_vector>::value,
83 typename t_bv::rank_1_type>::type,
85 t_sel =
typename std::conditional<std::is_same<t_bv,
86 bit_vector>::value and std::is_same<
typename t_csa::alphabet_category,
87 byte_alphabet_tag>::value,
88 select_support_scan<>,
89 typename t_bv::select_1_type>::type>
93 "First template argument has to be a compressed suffix array.");
101 typedef typename t_lcp::template type<cst_sct3>
lcp_type;
111 typedef typename t_csa::alphabet_type::sigma_type
sigma_type;
142 assert(m_bp[ckpos] == 0);
143 kpos = m_bp_support.find_open(ckpos);
144 return m_bp_support.rank(kpos) - 1;
156 if (v.cipos > v.jp1pos)
158 ckpos = v.jp1pos - 1;
164 assert(m_bp[ckpos] == 0);
167 kpos = m_bp_support.find_open(ckpos);
168 return m_bp_support.rank(kpos) - 1;
173 size_type r = ckpos - m_bp_support.rank(ckpos);
180 assert(m_bp[ckpos] == 0);
181 kpos = m_bp_support.find_open(ckpos);
182 return m_bp_support.rank(kpos) - 1;
187 if (kpos < m_bp.
size())
188 ckpos = m_bp_support.find_close(kpos);
224 size_type cipos = m_bp_support.find_close(ipos);
225 size_type result = m_bp_support.rank(cipos);
248 psvcpos = m_bp.
size() - 1;
253 psvpos = m_bp_support.enclose(ipos);
254 psvcpos = m_bp_support.find_close(psvpos);
255 return m_bp_support.rank(psvpos) - 1;
258 size_type r0 = cipos - m_bp_support.rank(cipos);
260 const uint64_t * p = m_first_child.data() + (r0 >> 6);
261 uint64_t w = (*p) >> (r0 & 0x3F);
264 next_first_child = cipos +
bits::lo(w);
265 if (cipos == next_first_child and m_bp[next_first_child + 1])
267 psvpos = m_bp_support.enclose(ipos);
268 psvcpos = m_bp_support.find_close(psvpos);
269 return m_bp_support.rank(psvpos) - 1;
277 while (!(w = *p) and steps-- > 0)
282 if (w != 0) { delta +=
bits::lo(w) + 1; }
285 auto pos = m_first_child_select(m_first_child_rank(r0 + 1) + 1);
288 next_first_child = cipos + delta;
290 if (!m_bp[next_first_child + 1])
292 psvcpos = next_first_child + 1;
293 psvpos = m_bp_support.find_open(psvcpos);
294 return m_bp_support.rank(psvpos) - 1;
298 psvpos = m_bp_support.enclose(m_bp_support.find_open(next_first_child));
299 psvcpos = m_bp_support.find_close(psvpos);
300 return m_bp_support.rank(psvpos) - 1;
310 size_type i = m_bp_support.select(l + 1);
311 size_type j = m_bp_support.select(r + 1);
312 size_type fc_i = m_bp_support.find_close(i);
319 size_type ec = m_bp_support.rr_enclose(i, j);
320 if (ec == m_bp_support.size())
326 return m_bp_support.rank(ec) - 1;
359 , m_bp_support(cst.m_bp_support)
360 , m_first_child(cst.m_first_child)
361 , m_first_child_rank(cst.m_first_child_rank)
362 , m_first_child_select(cst.m_first_child_select)
363 , m_nodes(cst.m_nodes)
366 m_bp_support.set_vector(&m_bp);
367 m_first_child_rank.set_vector(&m_first_child);
368 m_first_child_select.set_vector(&m_first_child);
376 : m_csa(std::move(cst.m_csa))
377 , m_bp(std::move(cst.m_bp))
378 , m_bp_support(std::move(cst.m_bp_support))
379 , m_first_child(std::move(cst.m_first_child))
380 , m_first_child_rank(std::move(cst.m_first_child_rank))
381 , m_first_child_select(std::move(cst.m_first_child_select))
382 , m_nodes(cst.m_nodes)
385 m_bp_support.set_vector(&m_bp);
386 m_first_child_rank.set_vector(&m_first_child);
387 m_first_child_select.set_vector(&m_first_child);
408 bool empty()
const {
return m_csa.empty(); }
416 if (0 == m_bp.
size())
424 if (0 == m_bp.
size() and
root() == v)
return end();
444 if (0 == m_bp.
size())
461 *
this = std::move(tmp);
474 m_csa = std::move(cst.m_csa);
476 m_bp = std::move(cst.m_bp);
477 m_bp_support = std::move(cst.m_bp_support);
478 m_bp_support.set_vector(&m_bp);
479 m_first_child = std::move(cst.m_first_child);
480 m_first_child_rank = std::move(cst.m_first_child_rank);
481 m_first_child_rank.set_vector(&m_first_child);
482 m_first_child_select = std::move(cst.m_first_child_select);
483 m_first_child_select.set_vector(&m_first_child);
484 m_nodes = std::move(cst.m_nodes);
498 void load(std::istream & in);
501 template <
typename archive_t>
505 template <
typename archive_t>
511 return (m_csa == other.m_csa) && (m_lcp == other.m_lcp) && (m_bp == other.m_bp) &&
512 (m_bp_support == other.m_bp_support) && (m_first_child == other.m_first_child) &&
513 (m_first_child_rank == other.m_first_child_rank) &&
514 (m_first_child_select == other.m_first_child_select) ;
549 assert(i > 0 and i <=
size());
553 jp1pos = m_bp.
size();
554 else if (m_bp[ipos + 1])
557 jp1pos = m_bp_support.select(i + 1);
558 return node_type(i - 1, i - 1, ipos, m_bp_support.find_close(ipos), jp1pos);
615 size_type psv_pos, psv_cpos, psv_v, nsv_v, nsv_p1pos;
616 psv_v = psv(v.
j + 1, v.
jp1pos, m_bp_support.find_close(v.
jp1pos), psv_pos, psv_cpos);
617 nsv_v = nsv(v.
j + 1, v.
jp1pos) - 1;
618 if (nsv_v ==
size() - 1) { nsv_p1pos = m_bp.
size(); }
621 nsv_p1pos = m_bp_support.select(nsv_v + 2);
623 return node_type(psv_v, nsv_v, psv_pos, psv_cpos, nsv_p1pos);
628 psv_v = psv(v.
i, v.
ipos, v.
cipos, psv_pos, psv_cpos);
662 bool last_child = m_bp[cjp1posm1];
666 size_type first_child_idx = cjp1posm1 - m_bp_support.rank(cjp1posm1);
667 last_child = m_first_child[first_child_idx];
673 if (nsv_v ==
size() - 1) { nsv_p1pos = m_bp.
size(); }
676 nsv_p1pos = m_bp_support.select(nsv_v + 2);
682 size_type new_j = m_bp_support.rank(m_bp_support.find_open(cjp1posm1)) - 2;
686 m_bp_support.find_close(v.
jp1pos),
687 m_bp_support.select(new_j + 2));
709 size_type k = 0, kpos = 0, k_find_close = 0;
711 k = select_l_index(v, 1, kpos, k_find_close);
717 k1 = select_l_index(v, i - 1, kpos1, k_find_close1);
718 if (k1 == v.
j + 1)
return root();
720 k2 = select_l_index(v, i, kpos2, k_find_close2);
721 return node_type(k1, k2 - 1, kpos1, k_find_close1, kpos2);
738 size_type r = closing_pos_of_first_l_index(v);
740 const uint64_t * p = m_first_child.data() + (r0 >> 6);
741 uint8_t offset = r0 & 0x3F;
748 else if (m_first_child.data() == p)
757 while (p > m_first_child.data() and steps-- > 0)
768 auto goal_rank = m_first_child_rank(r0);
769 if (goal_rank == 0) {
return r0 + 2; }
772 return r0 - m_first_child_select(goal_rank) + 1;
794 if (cc == 0 and c != 0)
796 size_type char_ex_max_pos = m_csa.C[((
size_type)1) + cc], char_inc_min_pos = m_csa.C[cc];
802 if (char_pos >= char_ex_max_pos)
807 else if (char_pos >= char_inc_min_pos)
816 if (char_pos < char_inc_min_pos)
821 else if (char_pos < char_ex_max_pos)
827 size_type l_bound = 2, r_bound = child_cnt, mid, kpos, ckpos, l_index;
828 while (l_bound < r_bound)
830 mid = (l_bound + r_bound) >> 1;
832 l_index = select_l_index(v, mid - 1, kpos, ckpos);
835 if (char_inc_min_pos > char_pos) { l_bound = mid + 1; }
836 else if (char_ex_max_pos <= char_pos)
844 size_type lp1_index = m_bp_support.rank(m_bp_support.find_open(ckpos - 1)) - 1;
846 if (lp1_index - 1 <
size() - 1) { jp1pos = m_bp_support.select(lp1_index + 1); }
847 return node_type(l_index, lp1_index - 1, kpos, ckpos, jp1pos);
858 return child(v, c, char_pos);
873 assert(d <=
depth(v));
876 while (c_begin < c_end)
878 mid = (c_begin + c_end) >> 1;
879 if (m_csa.C[mid] <= order) { c_begin = mid + 1; }
885 return m_csa.comp2char[c_begin - 1];
907 size_type min_index_pos = m_bp_support.select(min_index + 1);
908 size_type min_index_cpos = m_bp_support.find_close(min_index_pos);
910 if (min_index_cpos >= (m_bp.
size() - m_csa.sigma))
914 size_type new_j = nsv(min_index, min_index_pos) - 1;
916 size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos);
918 if (new_j <
size() - 1) { jp1pos = m_bp_support.select(new_j + 2); }
919 return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos);
932 if (v ==
root()) {
return 0; }
935 return size() - m_csa[v.
i];
940 size_type l = select_l_index(v, 1, kpos, ckpos);
982 if (v.
i == 0 and v.
j == 0)
990 size_type min_index_pos = m_bp_support.select(min_index + 1);
991 size_type min_index_cpos = m_bp_support.find_close(min_index_pos);
992 if (min_index_cpos >= (m_bp.
size() - m_csa.sigma))
996 size_type new_j = nsv(min_index, min_index_pos) - 1;
998 size_type new_i = psv(min_index, min_index_pos, min_index_cpos, new_ipos, new_icpos);
1000 if (new_j <
size() - 1) { jp1pos = m_bp_support.select(new_j + 2); }
1001 return node_type(new_i, new_j, new_ipos, new_icpos, jp1pos);
1016 size_type c_right = m_csa.bwt.rank(v.
j + 1, c);
1017 if (c_left == c_right)
1019 if (c_left + 1 == c_right)
1020 return select_leaf(m_csa.C[m_csa.char2comp[c]] + c_left + 1);
1023 size_type left = m_csa.C[m_csa.char2comp[c]] + c_left;
1024 size_type right = m_csa.C[m_csa.char2comp[c]] + c_right - 1;
1025 assert(left < right);
1027 size_type ipos = m_bp_support.select(left + 1);
1029 if (right <
size() - 1) { jp1pos = m_bp_support.select(right + 2); }
1030 return node_type(left, right, ipos, m_bp_support.find_close(ipos), jp1pos);
1066 ckpos = v.
cipos - 1;
1068 assert(m_bp[ckpos] == 0);
1069 size_type r0ckpos = ckpos - m_bp_support.rank(ckpos);
1070 return size() + m_first_child_rank(r0ckpos);
1093 id =
id -
size() + 1;
1099 size_type arg = m_first_child_rank(mid);
1100 if (arg <
id) {
lb = mid; }
1117 size_type arg = mid - m_bp_support.rank(mid - 1);
1118 if (arg < r0ckpos + 1) {
lb = mid; }
1126 if (ckpos == m_bp.
size() - 1) {
return root(); }
1127 if (m_bp[ckpos + 1])
1130 size_type j = m_bp_support.rank(jp1pos - 1) - 1;
1131 size_type kpos = m_bp_support.find_open(ckpos);
1132 size_type ipos = m_bp_support.enclose(kpos);
1133 size_type cipos = m_bp_support.find_close(ipos);
1134 size_type i = m_bp_support.rank(ipos - 1);
1135 return node_type(i, j, ipos, cipos, jp1pos);
1140 size_type ipos = m_bp_support.find_open(cipos);
1141 size_type i = m_bp_support.rank(ipos - 1);
1144 if (j !=
size() - 1) { jp1pos = m_bp_support.select(j + 2); }
1145 return node_type(i, j, ipos, cipos, jp1pos);
1164 if (
rb ==
size() - 1) { jp1pos = m_bp.
size(); }
1167 jp1pos = m_bp_support.select(
rb + 2);
1169 return node_type(
lb,
rb, ipos, m_bp_support.find_close(ipos), jp1pos);
1179 size_type ipos = m_bp_support.select(i + 1);
1180 size_type cipos = m_bp_support.find_close(ipos);
1181 return m_first_child_rank.rank(((ipos + cipos - 1) >> 1) - i);
1188 template <
class t_csa,
class t_lcp,
class t_bp_support,
class t_bv,
class t_rank,
class t_sel>
1195 m_nodes += m_bp.
size() / 2;
1196 if (m_bp.
size() == 2)
1203 util::init_support(m_bp_support, &m_bp);
1204 util::init_support(m_first_child_rank, &m_first_child);
1205 util::init_support(m_first_child_select, &m_first_child);
1207 if (!build_only_bps)
1214 if (!build_only_bps)
1221 template <
class t_csa,
class t_lcp,
class t_bp_support,
class t_bv,
class t_rank,
class t_sel>
1228 written_bytes += m_csa.serialize(out, child,
"csa");
1229 written_bytes += m_lcp.serialize(out, child,
"lcp");
1230 written_bytes += m_bp.
serialize(out, child,
"bp");
1231 written_bytes += m_bp_support.serialize(out, child,
"bp_support");
1232 written_bytes += m_first_child.serialize(out, child,
"mark_child");
1233 written_bytes += m_first_child_rank.serialize(out, child,
"mark_child_rank");
1234 written_bytes += m_first_child_select.serialize(out, child,
"mark_child_select");
1235 written_bytes +=
write_member(m_nodes, out, child,
"node_cnt");
1237 return written_bytes;
1240 template <
class t_csa,
class t_lcp,
class t_bp_support,
class t_bv,
class t_rank,
class t_sel>
1246 m_bp_support.load(in, &m_bp);
1247 m_first_child.load(in);
1248 m_first_child_rank.load(in, &m_first_child);
1249 m_first_child_select.load(in, &m_first_child);
1253 template <
class t_csa,
class t_lcp,
class t_bp_support,
class t_bv,
class t_rank,
class t_sel>
1254 template <
typename archive_t>
1267 template <
class t_csa,
class t_lcp,
class t_bp_support,
class t_bv,
class t_rank,
class t_sel>
1268 template <
typename archive_t>
1276 m_bp_support.set_vector(&m_bp);
1279 m_first_child_rank.set_vector(&m_first_child);
1281 m_first_child_select.set_vector(&m_first_child);
1285 template <
class t_
int>
1310 if (
i != interval.
i)
return i < interval.
i;
1311 return j < interval.
j;
1331 template <
class t_
int>
1334 os <<
"-[" << interval.
i <<
"," << interval.
j <<
"](" << interval.
ipos <<
"," << interval.
cipos <<
","
1335 << interval.
jp1pos <<
")";
bp_support.hpp contains several classed which support find_open, find_close, enclose and rr-enclose q...
A forward iterator for a bottom up traversal of a suffix tree.
An forward iterator for (compressed) suffix trees.
A class for the Compressed Suffix Tree (CST) proposed by Ohlebusch and Gog.
t_csa::alphabet_type::sigma_type sigma_type
size_type id(const node_type &v) const
Computes a unique identification number for a node of the suffx tree in the range [0....
node_type parent(const node_type &v) const
Calculate the parent node of a node v.
node_type lca(node_type v, node_type w) const
Calculate the LCA of two nodes v and w
const_iterator begin() const
Returns a const_iterator to the first element of a depth first traversal of the tree.
const_bottom_up_iterator end_bottom_up() const
Returns an iterator to the element after the last element of a bottom-up traversal of the tree.
node_type rightmost_leaf(const node_type &v) const
Calculates the rightmost leaf in the subtree rooted at node v.
size_type size(const node_type &v) const
Calculate the number of leaves in the subtree rooted at node v.
node_type wl(const node_type &v, const char_type c) const
Compute the Weiner link of node v and character c.
const sel_type & first_child_select
t_bp_support bp_support_type
static size_type max_size()
Returns the largest size that cst_sct3 can ever have.
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Serialise (load) via cereal.
cst_sct3 & operator=(cst_sct3 &&cst)
Assignment Move Operator.
node_type select_leaf(size_type i) const
Return the i-th leaf (1-based from left to right).
node_type root() const
Return the root of the suffix tree.
node_type inv_id(size_type id)
Computes the node for such that id(v)=id.
size_type node_depth(node_type v) const
Returns the node depth of node v.
const_bottom_up_iterator begin_bottom_up() const
Returns an iterator to the first element of a bottom-up traversal of the tree.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serialize to a stream.
node_type node(size_type lb, size_type rb) const
Get the node in the suffix tree which corresponds to the lcp-interval [lb..rb].
size_type nodes() const
Get the number of nodes of the suffix tree.
cst_bottom_up_const_forward_iterator< cst_sct3 > const_bottom_up_iterator
const_iterator begin(const node_type &v) const
Returns a const_iterator to the first element of a depth first traversal of the subtree rooted at nod...
t_csa::string_type string_type
const bv_type & first_child_bv
bool empty() const
Returns if the data structure is empty.
cst_sct3(cst_sct3 &&cst)
Move constructor.
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
t_csa::char_type char_type
bool operator==(cst_sct3 const &other) const noexcept
Equality operator.
bool operator!=(cst_sct3 const &other) const noexcept
Inequality operator.
size_type size() const
Number of leaves of the suffix tree.
cst_sct3 & operator=(const cst_sct3 &cst)
Assignment Operator.
node_type child(const node_type &v, const char_type c) const
Get the child w of node v which edge label (v,w) starts with character c.
const rank_type & first_child_rank
const bp_support_type & bp_support
char_type edge(const node_type &v, size_type d) const
Returns the d-th character (1-based indexing) of the edge-label pointing to v.
node_type sl(const node_type &v) const
Compute the suffix link of node v.
node_type child(const node_type &v, const char_type c, size_type &char_pos) const
Get the child w of node v which edge label (v,w) starts with character c.
bool is_leaf(const node_type &v) const
Decide if a node is a leaf.
const_iterator end(const node_type &v) const
Returns a const_iterator to the element past the end of a depth first traversal of the subtree rooted...
const_iterator end() const
Returns a const_iterator to the element after the last element of a depth first traversal of the tree...
cst_dfs_const_forward_iterator< cst_sct3 > const_iterator
t_csa::size_type size_type
bp_interval< size_type > node_type
Type for the nodes in the tree.
node_type sibling(const node_type &v) const
Returns the next sibling of node v.
node_type leftmost_leaf(const node_type &v) const
Calculates the leftmost leaf in the subtree rooted at node v.
size_type depth(const node_type &v) const
Returns the string depth of node v.
t_lcp::template type< cst_sct3 > lcp_type
ptrdiff_t difference_type
size_type sn(const node_type &v) const
Computes the suffix number of a leaf node v.
size_type rb(const node_type &v) const
Calculates the index of the rightmost leaf in the corresponding suffix array.
cst_sct3(const cst_sct3 &cst)
Copy constructor.
t_csa::alphabet_category alphabet_category
t_csa::alphabet_type::comp_char_type comp_char_type
void load(std::istream &in)
Load from a stream.
node_type select_child(const node_type &v, size_type i) const
Get the i-th child of a node v.
size_type lb(const node_type &v) const
Calculates the index of the leftmost leaf in the corresponding suffix array.
cst_node_child_proxy< cst_sct3 > children(const node_type &v) const
Return a proxy object which allows iterating over the children of a node.
size_type tlcp_idx(size_type i) const
Maps an index i to the position in TLCP where LCP[i] can be found.
size_type degree(const node_type &v) const
Get the number of children of a node v.
cst_sct3()=default
Default constructor.
void load(std::istream &in)
Load the int_vector for a stream.
size_type size() const noexcept
The number of elements in the int_vector.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the int_vector to a stream.
static mm_event_proxy event(const std::string &name)
static void add_size(structure_tree_node *v, uint64_t value)
static structure_tree_node * add_child(structure_tree_node *v, const std::string &name, const std::string &type)
construct.hpp contains methods to construct indexes (compressed suffix arrays and trees).
csa_wt.hpp contains an implementation of the compressed suffix array based on a wavelet tree.
cst_iterators.hpp contains iterator classes for traversing (compressed) suffix arrays.
int_vector.hpp contains the sdsl::int_vector class.
iterators.hpp contains an generic iterator for random access containers.
lcp.hpp contains classes for lcp information.
int_vector ::size_type size_type
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
t_csa::size_type get_char_pos(typename t_csa::size_type idx, typename t_csa::size_type d, const t_csa &csa)
bool load_from_cache(T &v, const std::string &key, const cache_config &config, bool add_type_hash=false)
std::ostream & operator<<(std::ostream &os, const bp_interval< t_int > &interval)
void move_lcp(t_lcp &&lcp, t_lcp &&lcp_c, const t_cst &cst)
void swap(int_vector_reference< t_int_vector > x, int_vector_reference< t_int_vector > y) noexcept
void copy_lcp(t_lcp &lcp, const t_lcp &lcp_c, const t_cst &cst)
bit_vector::size_type construct_supercartesian_tree_bp_succinct_and_first_child(int_vector_buffer< t_width > &lcp_buf, bit_vector &bp, bit_vector &bp_fc, const bool minimum=true)
Calculate the balanced parentheses of the Super-Cartesian tree, described in Ohlebusch and Gog (SPIRE...
void construct_lcp(t_lcp &lcp, const t_cst &cst, cache_config &config)
size_t write_member(const T &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
void read_member(T &t, std::istream &in)
void set_lcp_pointer(t_lcp &lcp, const t_cst &cst)
void load_lcp(t_lcp &lcp, std::istream &in, const t_cst &cst)
rank_support.hpp contains classes that support a sdsl::bit_vector with constant time rank information...
Contains declarations and definitions of data structure concepts.
select_support.hpp contains classes that support a sdsl::bit_vector with constant time select informa...
static SDSL_CONSTEXPR uint32_t lo(uint64_t x)
Calculates the position of the rightmost 1-bit in the 64bit integer x if it exists.
constexpr static uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
t_int i
The left border of the lcp-interval .
bp_interval(t_int i=0, t_int j=0, t_int ipos=0, t_int cipos=0, t_int jp1pos=0)
Constructor.
bp_interval(const bp_interval &iv)=default
Copy constructor.
bp_interval & operator=(bp_interval &&interval)=default
Move assignment.
t_int j
The right border of the lcp-interval .
bp_interval & operator=(const bp_interval &interval)=default
Assignment operator.
bool operator!=(const bp_interval &interval) const
Inequality operator.
bool operator==(const bp_interval &interval) const
Equality operator.
bool operator<(const bp_interval &interval) const
bp_interval(bp_interval &&iv)=default
Move copy constructor.
Helper class for construction process.
suffix_tree_algorithm.hpp contains algorithms on CSTs
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.