8 #ifndef INCLUDED_SDSL_CONSTRUCT_LCP
9 #define INCLUDED_SDSL_CONSTRUCT_LCP
54 template <u
int8_t t_w
idth>
57 static_assert(t_width == 0 or t_width == 8,
58 "construct_lcp_kasai: width must be `0` for integer alphabet and `8` for byte alphabet");
71 for (
size_type i = 0, j = 0, sa_1 = 0, l = 0, n = isa_buf.
size(); i < n; ++i)
79 while (text[i + l] == text[j + l])
92 for (
size_type i = sa.
size(); i > 1; --i) { sa[i - 1] = sa[i - 2]; }
115 template <u
int8_t t_w
idth>
118 static_assert(t_width == 0 or t_width == 8,
119 "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
136 for (
size_type i = 0, sai_1 = 0; i < n; ++i)
149 for (
size_type i = 0, l = 0; i < n - 1; ++i)
152 while (text[i + l] == text[phii + l]) { ++l; }
156 max_l = std::max(max_l, l);
161 uint8_t lcp_width =
bits::hi(max_l) + 1;
172 lcp_buf[i] = plcp[sai];
206 const uint8_t log_q = 6;
207 const uint32_t q = 1 << log_q;
213 for (
size_type i = 0, sai_1 = 0; i < n; ++i)
217 if ((sai & modq) == 0)
219 if ((sai >> log_q) >= plcp.
size())
224 plcp[sai >> log_q] = sai_1;
236 while (text[j + l] == text[k + l]) ++l;
238 if (l >= q) { l -= q; }
252 for (
size_type i = 0, sai_1 = 0, l = 0, sai = 0, iq = 0; i < n; ++i)
256 if ((sai & modq) == 0)
258 lcp_out_buf[i] = l = plcp[sai >> log_q];
263 l = plcp[sai >> log_q];
268 while (text[sai + l] == text[sai_1 + l]) ++l;
273 for (j = 0; j < l; ++j)
275 if (text[sai + j] != text[sai_1 + j])
277 std::cout <<
"lcp[" << i <<
"]=" << l <<
" is two big! " << j <<
" is right!"
278 <<
" sai=" << sai << std::endl;
279 if ((sai & modq) != 0)
280 std::cout <<
" plcp[sai>>log_q]=" << plcp[sai >> log_q] <<
" sai-iq=" << sai - iq <<
" sai=" << sai
281 <<
" sai-iq=" << sai - iq << std::endl;
315 #ifdef STUDY_INFORMATIONS
339 unsigned char alphabet[257] = { 0 };
344 uint8_t m_chars[2][256] = { { 0 }, { 0 } };
356 ++cnt_c[text[i] + 1];
358 for (
int i = 1; i < 257; ++i)
360 if (cnt_c[i] > 0) { alphabet[sigma++] = (
unsigned char)(i - 1); }
361 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
363 alphabet[sigma] =
'\0';
367 uint8_t bwti_1 = bwt_buf[0];
368 lcp_sml[cnt_cc[bwti_1]++] = 0;
369 prev_occ_in_bwt[bwti_1] = 0;
370 ++omitted_c[alphabet[0]];
380 uint8_t cur_c = alphabet[1];
382 for (
size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
384 uint8_t bwti = bwt_buf[i];
389 if (cur_c_cnt < sigma) { cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1]; }
392 if (i >= cnt_cc[cur_c])
394 if (bwti == bwti_1 and lf < i)
396 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
399 l += (text[sai_1 + m] == text[sai + m]);
400 #ifdef STUDY_INFORMATIONS
401 if ((sai_1 ^ sai) >> 6)
410 if (lf < i) l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
411 #ifdef STUDY_INFORMATIONS
412 if ((sai_1 ^ sai) >> 6)
415 while (text[sai_1 + l] == text[sai + l] and l < m + 1)
418 #ifdef STUDY_INFORMATIONS
432 if (i > 10000 and i < 10500 and big_val > 3000)
435 util::clear(lcp_sml);
436 construct_lcp_PHI<8>(config);
444 while (x <= rmq_stack[j]) j -= 2;
445 rmq_stack[++j] = i + 1;
453 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
455 while (x_pos <= rmq_stack[j]) j -= 2;
456 lcp_sml[lf] = rmq_stack[j + 3] -
457 (rmq_stack[j + 3] == m + 2);
461 if (l == m)
push_front_m_index(nn, cur_c, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
467 prev_occ_in_bwt[bwti] = i;
475 if (n > 1000 and nn > 5 * (n / 6))
477 util::clear(lcp_sml);
478 construct_lcp_PHI<8>(config);
483 #ifdef STUDY_INFORMATIONS
484 std::cout <<
"# n=" << n <<
" nn=" << nn <<
" nn/n=" << ((double)nn) / n << std::endl;
502 if (lcp_sml_buf[i] >= m) { todo[i] = 1; }
506 cnt_cc2[0] = cnt_cc[0] = 0;
507 for (
size_type i = 1, omitted_sum = 0; i < 257; ++i)
509 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
510 omitted_sum += omitted_c[i - 1];
511 cnt_cc2[i] = cnt_cc[i] - omitted_sum;
515 for (
size_type i = 0, i2 = 0; i < n; ++i)
517 uint8_t b = bwt_buf[i];
523 lcp_big[i2] = cnt_cc2[b];
549 for (
size_type i = 0, i2 = 0; i < n; ++i)
551 uint8_t b = bwt_buf[i];
552 if (lcp_sml_buf[i] >= m)
555 shift_bwt2[i2] = b_1;
576 for (
size_type i = 0; i < 256; ++i) char_ex[i] = nn;
578 size_type m_mod2 = m2 % 2, mm1_mod2 = (m2 + 1) % 2;
579 while (m_char_count[m_mod2] > 0)
584 mm1_mod2 = (m2 + 1) % 2, m_mod2 = m2 % 2;
585 m_char_count[m_mod2] = 0;
587 std::sort(m_chars[mm1_mod2],
588 m_chars[mm1_mod2] + m_char_count[mm1_mod2]);
590 for (
size_type mc = 0; mc < m_char_count[mm1_mod2]; ++mc)
592 tLI & mm1_mc_list = m_list[mm1_mod2][m_chars[mm1_mod2][m_char_count[mm1_mod2] - 1 - mc]];
594 while (!mm1_mc_list.empty())
597 mm1_mc_list.pop_front();
601 #ifdef STUDY_INFORMATIONS
604 uint8_t b = shift_bwt2[k];
612 for (
size_type k = i; todo2[k] and char_occ; ++k)
614 #ifdef STUDY_INFORMATIONS
625 if (!run2[k + 1])
break;
650 lcp_big_buf.
width());
651 for (
size_type i = 0, i2 = 0; i < n; ++i)
665 #ifdef STUDY_INFORMATIONS
666 std::cout <<
"# racs: " << racs << std::endl;
667 std::cout <<
"# matches: " << matches << std::endl;
668 std::cout <<
"# comps2: " << comps2 << std::endl;
713 unsigned char alphabet[257] = { 0 };
726 ++cnt_c[text[i] + 1];
728 for (
int i = 1; i < 257; ++i)
730 if (cnt_c[i] > 0) { alphabet[sigma++] = (
unsigned char)(i - 1); }
731 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
733 alphabet[sigma] =
'\0';
737 uint8_t bwti_1 = bwt_buf[0];
738 lcp_sml[cnt_cc[bwti_1]++] = 0;
739 prev_occ_in_bwt[bwti_1] = 0;
740 ++omitted_c[alphabet[0]];
749 uint8_t cur_c = alphabet[1];
750 for (
size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
752 uint8_t bwti = bwt_buf[i];
757 if (cur_c_cnt < sigma) { cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1]; }
760 if (i >= cnt_cc[cur_c])
762 if (bwti == bwti_1 and lf < i)
764 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
767 l += (text[sai_1 + m] == text[sai + m]);
774 if (lf < i) l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
775 while (text[sai_1 + l] == text[sai + l] and l < m + 1) { ++l; }
787 while (x <= rmq_stack[j]) j -= 2;
788 rmq_stack[++j] = i + 1;
796 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
798 while (x_pos <= rmq_stack[j]) j -= 2;
799 lcp_sml[lf] = rmq_stack[j + 3] -
800 (rmq_stack[j + 3] == m + 2);
806 prev_occ_in_bwt[bwti] = i;
828 if (lcp_sml_buf[i] > m) { todo[sa_buf[i]] = 1; }
830 sa_n_1 = sa_buf[n - 1];
840 for (
size_type i = 0, sai_1 = 0; i < n; ++i)
842 uint8_t b = bwt_buf[i];
844 if (lcp_sml_buf[i] > m and b != b_1)
846 phi[todo_rank(sai)] = sai_1;
854 for (
size_type i = 0, ii = 0, l = m + 1, p = 0; i < n and ii < nn; ++i)
858 if (i > 0 and todo[i - 1])
862 if ((p = phi[ii]) != bot)
864 while (text[i + l] == text[p + l]) ++l;
874 for (
size_type i = 0, ii = 0; i < n and ii < nn; ++i)
876 if (lcp_sml_buf[i] > m) { lcp_big[ii++] = phi[todo_rank(sa_buf[i])]; }
895 lcp_big_buf.
width());
897 for (
size_type i = 0, i2 = 0; i < n; ++i)
907 lcp_big_buf.
close(
true);
908 lcp_sml_buf.
close(
true);
930 template <
typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
940 uint64_t n = wt_bwt.size();
952 std::queue<size_type> q;
953 std::vector<bit_vector> dict(2);
955 bool queue_used =
true;
960 std::vector<unsigned char> cs(wt_bwt.sigma);
961 std::vector<size_type> rank_c_i(wt_bwt.sigma);
962 std::vector<size_type> rank_c_j(wt_bwt.sigma);
968 if (n * 20 <
size_in_bytes(wt_bwt) * 8 * 1.25 + 5 * n) { bb = 6; }
971 size_type lcp_value_max = (1ULL << bb) - 1;
974 #ifdef STUDY_INFORMATIONS
975 std::cout <<
"# l=" << n <<
" b=" << (int)bb <<
" lcp_value_max=" << lcp_value_max
976 <<
" size_in_bytes(wt_bwt<v,bs,bs>)=" <<
size_in_bytes(wt_bwt) << std::endl;
987 std::vector<size_type> C;
995 index_done[0] =
true;
999 unsigned char c = cs[i];
1004 if (!index_done[b_new])
1006 if (b_new < n) partial_lcp[b_new] = lcp_value;
1007 index_done[b_new] =
true;
1019 if (intervals < use_queue_and_wt && !queue_used)
1022 util::clear(dict[target]);
1027 while (b2 < dict[source].
size())
1029 q.push((a2 - 1) >> 1);
1035 util::clear(dict[source]);
1038 if (intervals >= use_queue_and_wt && queue_used)
1041 dict[source].resize(2 * (n + 1));
1047 dict[source][(q.front() << 1) + 1] = 1;
1049 dict[source][(q.front() << 1)] = 1;
1052 dict[target].resize(2 * (n + 1));
1058 if (intervals < use_queue_and_wt)
1072 for (
size_type i = 0; i < quantity; ++i)
1074 unsigned char c = cs[i];
1079 if (!index_done[b_new] and phase == 0)
1081 partial_lcp[b_new] = lcp_value;
1082 index_done[b_new] =
true;
1088 else if (!index_done[b_new])
1091 if (!partial_lcp[insert_pos])
1093 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1102 intervals = intervals_new;
1113 while (b2 < dict[source].
size())
1115 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1116 for (
size_type i = 0; i < quantity; ++i)
1118 unsigned char c = cs[i];
1122 if (!index_done[b_new] and phase == 0)
1124 partial_lcp[b_new] = lcp_value;
1125 index_done[b_new] =
true;
1127 dict[target][(a_new << 1) + 1] = 1;
1128 dict[target][(b_new << 1)] = 1;
1131 else if (!index_done[b_new])
1134 if (!partial_lcp[insert_pos])
1136 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1138 dict[target][(a_new << 1) + 1] = 1;
1139 dict[target][(b_new << 1)] = 1;
1152 if (lcp_value >= lcp_value_max)
1155 if (phase) {
insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset); }
1162 util::init_support(ds_rank_support, &index_done);
1165 lcp_value_offset = lcp_value_max - 1;
1168 uint8_t int_width_new = std::min(space_in_bit_for_lcp / remaining_lcp_values,
1170 lcp_value_max = lcp_value_offset + (1ULL << int_width_new);
1171 #ifdef STUDY_INFORMATIONS
1172 std::cout <<
"# l=" << remaining_lcp_values <<
" b=" << (int)int_width_new
1173 <<
" lcp_value_max=" << lcp_value_max << std::endl;
1176 partial_lcp.
width(int_width_new);
1177 partial_lcp.
resize(remaining_lcp_values);
1188 if (phase) {
insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset); }
1214 template <
typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
1237 std::queue<size_type> q;
1238 std::vector<bit_vector> dict(2);
1240 bool queue_used =
true;
1245 std::vector<unsigned char> cs(wt_bwt.sigma);
1246 std::vector<size_type> rank_c_i(wt_bwt.sigma);
1247 std::vector<size_type> rank_c_j(wt_bwt.sigma);
1250 bool new_lcp_value =
false;
1251 uint8_t int_width =
bits::hi(n) + 2;
1260 std::vector<size_type> C;
1267 lcp_positions_buf[idx_out_buf++] = 0;
1270 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1271 new_lcp_value =
false;
1273 index_done[0] =
true;
1277 for (
size_type i = 0; i < quantity; ++i)
1279 unsigned char c = cs[i];
1284 if (!index_done[b_new])
1289 lcp_positions_buf[idx_out_buf++] = b_new;
1291 index_done[b_new] =
true;
1300 new_lcp_value =
true;
1305 if (intervals < use_queue_and_wt && !queue_used)
1308 util::clear(dict[target]);
1313 while (b2 < dict[source].
size())
1315 q.push((a2 - 1) >> 1);
1321 util::clear(dict[source]);
1324 if (intervals >= use_queue_and_wt && queue_used)
1327 dict[source].resize(2 * (n + 1));
1332 dict[source][(q.front() << 1) + 1] = 1;
1334 dict[source][(q.front() << 1)] = 1;
1337 dict[target].resize(2 * (n + 1));
1342 if (intervals < use_queue_and_wt)
1356 for (
size_type i = 0; i < quantity; ++i)
1358 unsigned char c = cs[i];
1363 if (!index_done[b_new])
1366 lcp_positions_buf[idx_out_buf++] = b_new;
1370 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1371 new_lcp_value =
false;
1373 index_done[b_new] =
true;
1382 intervals = intervals_new;
1392 while (b2 < dict[source].
size())
1394 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1395 for (
size_type i = 0; i < quantity; ++i)
1397 unsigned char c = cs[i];
1401 if (!index_done[b_new])
1404 lcp_positions_buf[idx_out_buf++] = b_new;
1408 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1409 new_lcp_value =
false;
1411 index_done[b_new] =
true;
1413 dict[target][(a_new << 1) + 1] = 1;
1414 dict[target][(b_new << 1)] = 1;
1426 new_lcp_value =
true;
1429 lcp_positions_buf.
close();
1437 uint8_t int_width =
bits::hi(lcp_value + 1) + 1;
1442 size_type number_of_values = ((n / ((int_width - 1ULL) / 8 + 1) + 16) & (~(0x7ULL)));
1446 number_of_values * int_width / 8,
1448 number_of_values = lcp_array.
buffersize() * 8 / int_width;
1450 for (
size_type position_begin = 0, position_end = number_of_values; position_begin < n and number_of_values > 0;
1451 position_begin = position_end, position_end += number_of_values)
1453 #ifdef STUDY_INFORMATIONS
1454 std::cout <<
"# number_of_values=" << number_of_values <<
" fill lcp_values with " << position_begin
1455 <<
" <= position <" << position_end <<
", each lcp-value has " << (int)int_width
1456 <<
" bit, lcp_value_max=" << lcp_value <<
" n=" << n << std::endl;
1467 if (position_begin <= position and position < position_end) { lcp_array[position] = lcp_value; }
1473 lcp_positions.
close(
true);
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
uint8_t width() const
Returns the width of the integers which are accessed via the [] operator.
void shrink_to_fit()
Free unused allocated memory.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
size_type size() const noexcept
The number of elements in the int_vector.
reference front() noexcept
Returns first element.
void resize(const size_type size)
Resize the int_vector in terms of elements.
static mm_event_proxy event(const std::string &name)
A rank structure proposed by Sebastiano Vigna.
size_type rank(size_type idx) const
Answers rank queries for the supported bit_vector.
construct_bwt.hpp contains a space and time efficient construction method for the Burrows and Wheeler...
construct_isa.hpp contains a space and time efficient construction method for the inverse suffix arra...
int_vector.hpp contains the sdsl::int_vector class.
constexpr char KEY_TEXT[]
int_vector ::size_type size_type
Get the smallest position f $i geq idx f where a bit is set t_int_vec::size_type next_bit(const t_int_vec &v, uint64_t idx)
void set_to_value(t_int_vec &v, uint64_t k)
Set all entries of int_vector to value k.
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
bool load_from_cache(T &v, const std::string &key, const cache_config &config, bool add_type_hash=false)
void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
void interval_symbols(const t_wt &wt, typename t_wt::size_type i, typename t_wt::size_type j, typename t_wt::size_type &k, std::vector< typename t_wt::value_type > &cs, std::vector< typename t_wt::size_type > &rank_c_i, std::vector< typename t_wt::size_type > &rank_c_j)
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
void construct_lcp_kasai(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void construct_lcp_bwt_based(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void create_C_array(std::vector< uint64_t > &C, const tWT &wt)
void insert_lcp_values(int_vector<> &partial_lcp, bit_vector &index_done, std::string lcp_file, uint64_t max_lcp_value, uint64_t lcp_value_offset)
Merges a partial LCP array into the LCP array on disk.
void construct_lcp_PHI(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void swap(int_vector_reference< t_int_vector > x, int_vector_reference< t_int_vector > y) noexcept
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_lcp_goPHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_isa(cache_config &config)
void construct_lcp_bwt_based2(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
bool store_to_cache(const T &v, const std::string &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
void construct_lcp_go(cache_config &config)
Construct the LCP array (only for byte strings)
int_vector ::size_type size(const range_type &r)
Size of a range.
std::list< int_vector<>::size_type > tLI
void push_front_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
T::size_type size_in_bytes(const T &t)
Get the size of a data structure in bytes.
rank_support.hpp contains classes that support a sdsl::bit_vector with constant time rank information...
select_support.hpp contains classes that support a sdsl::bit_vector with constant time select informa...
sfstream.hpp contains a two stream class which can be used to read/write from/to files or strings.
constexpr static uint64_t lo_unset[65]
lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set.
constexpr static uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Helper class for construction process.
Helper classes to transform width=0 and width=8 to corresponding text key.
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.