4 #ifndef SDSL_CONSTRUCT_SA_SE
5 #define SDSL_CONSTRUCT_SA_SE
20 template <
class int_vector_type>
23 if (i >= text.size() - 3) {
return text.size() - 1; }
25 uint64_t ci = text[i], cip1 = text[i + 1];
33 uint64_t candidate = i + 1;
38 if (i + 1 == text.size() - 1) {
return text.size() - 1; }
50 std::string & filename_sa,
56 uint64_t buffersize = 1024 * 1024 / 8;
59 size_t number_of_lms_strings = 0;
63 std::vector<uint64_t> bkt(sigma, 0);
66 for (
size_t i = 0; i < n; ++i) { ++bkt[text[text_offset + i]]; }
70 for (
size_t c = 0; c < sigma; ++c) { c_array[c] = bkt[c]; }
74 for (
size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + bkt[c]; }
77 for (
size_t i = n - 2, was_s_typ = 1; i < n; --i)
79 if (text[text_offset + i] > text[text_offset + i + 1])
83 sa[bkt[text[text_offset + i + 1]]--] = i + 1;
84 ++number_of_lms_strings;
88 else if (text[text_offset + i] < text[text_offset + i + 1])
96 for (
size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c - 1]; }
99 for (
size_t i = 0; i < n; ++i)
101 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
103 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
110 for (
size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
113 for (
size_t i = n - 1, endpointer = n; i < n; --i)
117 if (text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
119 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
123 sa[--endpointer] = sa[i];
130 for (
size_t i = n - 2, end = n - 2, was_s_typ = 1; i < n; --i)
132 if (text[text_offset + i] > text[text_offset + i + 1])
136 sa[(i + 1) >> 1] = end - i;
141 else if (text[text_offset + i] < text[text_offset + i + 1])
148 for (
size_t i = n - number_of_lms_strings + 1, cur_pos = 0, cur_len = 0, last_pos = n - 1, last_len = 1; i < n;
152 cur_len = sa[(cur_pos >> 1)];
153 if (cur_len == last_len)
156 while (l < cur_len and text[text_offset + cur_pos + l] == text[text_offset + last_pos + l]) { ++l; }
157 if (l >= cur_len) { --name; }
159 sa[(cur_pos >> 1)] = ++name;
166 if (name + 1 < number_of_lms_strings)
169 for (
size_t i = 0, t = n - number_of_lms_strings; i < (n >> 1); ++i)
184 number_of_lms_strings,
185 n - number_of_lms_strings,
189 for (
size_t i = n - 2, endpointer = n - 1, was_s_typ = 1; i < n; --i)
191 if (text[text_offset + i] > text[text_offset + i + 1])
195 sa[endpointer--] = i + 1;
199 else if (text[text_offset + i] < text[text_offset + i + 1])
206 for (
size_t i = 0; i < number_of_lms_strings; ++i)
209 sa[i] = sa[n - number_of_lms_strings + pos];
210 sa[n - number_of_lms_strings + pos] = 0;
217 for (
size_t i = 1; i < number_of_lms_strings; ++i)
219 sa[i] = sa[n - number_of_lms_strings + i];
220 sa[n - number_of_lms_strings + i] = 0;
223 for (
size_t i = number_of_lms_strings; i < (n >> 1); ++i) { sa[i] = 0; }
232 std::vector<uint64_t> bkt(sigma, 0);
233 for (
size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
236 for (
size_t i = number_of_lms_strings - 1; i < n; --i)
240 sa[bkt[text[text_offset + pos]]--] = pos;
245 for (
size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c - 1]; }
248 for (
size_t i = 0; i < n; ++i)
250 if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
252 sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
258 for (
size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
259 for (
size_t i = n - 1; i < n; --i)
261 if (sa[i] > 0 and text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
263 sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
270 template <
class int_vector_type>
271 void _construct_sa_se(int_vector_type & text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
275 uint64_t n = text.size();
277 uint8_t int_width =
bits::hi(n - 1) + 1;
278 uint64_t buffersize = 1024 * 1024 / 8;
283 size_t first_lms_pos = 0;
284 size_t number_of_lms_strings = 0;
285 size_t bkt_s_last = 0, bkt_s_sum = 0, bound_s = 0, bkt_l_sum = 0;
295 uint64_t ci = text[n - 1];
298 for (
size_t i = n - 2; i < n; --i)
305 ++bkt_s[text[i + 1]];
309 lms_pos_b[i + 1] = 1;
310 ++number_of_lms_strings;
311 first_lms_pos = i + 1;
320 if (was_s_typ) { ++bkt_s[ci]; }
321 bkt_l[0] = C[0] - bkt_s[0];
322 for (
size_t i = 1; i < C.
size(); ++i)
324 bkt_l[i] = C[i] - bkt_s[i];
325 C[i] = C[i] + C[i - 1];
335 size_t right_pointer = 0;
340 size_t left_pointer = 0;
342 for (
size_t i = 0, tmp2 = 0, tmp = 0; i < sigma; ++i)
352 for (
size_t i = n - 2, was_s_typ = 1, ci = text[n - 1]; i < n; --i)
374 int_vector<> lms_strings(number_of_lms_strings, 0, int_width);
375 for (
size_t i = 0; i < lms_positions.
size();)
377 size_t idx = lms_positions[i++];
378 size_t val = lms_positions[i++];
379 lms_strings[idx] = val;
381 lms_positions.
close(
true);
384 for (
size_t i = 0; i < number_of_lms_strings; ++i)
386 left[left_pointer++] = lms_strings[number_of_lms_strings - i - 1];
396 for (
size_t i = 0, tmp = 0; i < sigma; ++i)
399 bkt_l[i] = bkt_l_sum;
401 bkt_lms[i] = bkt_l[i];
404 size_t partsize = bkt_l_sum / parts + 1;
407 std::vector<int_vector_buffer<>> cached_array(parts - 1);
408 for (
size_t i = 0; i < cached_array.size(); ++i)
417 for (
size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
420 for (; pos < bkt_l[c]; ++pos)
423 if (pos - offset >= partsize)
426 for (
size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
428 size_t src = cached_array[cur_part][i++];
429 size_t val = cached_array[cur_part][i++];
430 array[src - offset] = val;
432 cached_array[pos / partsize - 1].reset();
435 size_t idx = array[pos - offset];
436 if (idx == 0) { right[right_pointer++] = idx; }
439 size_t symbol = text[idx - 1];
442 size_t val = idx - 1;
443 size_t src = bkt_l[symbol];
444 bkt_l[symbol] = bkt_l[symbol] + 1;
445 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
448 size_t part = src / partsize - 1;
450 cached_array[part].push_back(val);
455 right[right_pointer++] = idx;
460 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
462 size_t idx = left[left_pointer--];
464 size_t symbol = text[idx];
467 size_t src = bkt_l[symbol];
468 bkt_l[symbol] = bkt_l[symbol] + 1;
469 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
472 size_t part = src / partsize - 1;
474 cached_array[part].push_back(val);
478 for (
size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(
true); }
481 for (
size_t i = 0; i < sigma; ++i) { bkt_l[i] = bkt_lms[i]; }
490 bkt_s_last = 0, bkt_s_sum = 0;
491 for (
size_t i = 0; i < sigma; ++i)
493 bkt_s_sum += bkt_s[i];
496 bkt_s[i] = bkt_s_sum;
497 bkt_s_last = bkt_s_sum;
501 bkt_s[i] = bkt_s_sum;
503 bkt_lms[i] = bkt_s[i];
508 for (
size_t i = 0; i < sigma; ++i)
510 if (bkt_s[i] > bkt_s_sum / 2)
512 bkt_s_sum = bkt_s[i];
517 size_t partsize = bound_s / parts + 1;
519 std::vector<int_vector_buffer<>> cached_array(parts - 1);
520 for (
size_t i = 0; i < cached_array.size(); ++i)
528 for (
size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
531 for (; pos + 1 > bkt_s[c]; --pos)
537 for (
size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
539 size_t src = cached_array[cur_part][i++];
540 size_t val = cached_array[cur_part][i++];
541 array[src - offset] = val;
543 cached_array[offset / partsize].reset();
546 size_t idx = array[pos - offset];
547 if (idx == 0) { idx = n; }
549 size_t symbol = text[idx];
552 bkt_s[symbol] = bkt_s[symbol] - 1;
554 size_t src = bkt_s[symbol];
555 if (src >= offset) { array[src - offset] = val; }
558 size_t part = src / partsize;
560 cached_array[part].push_back(val);
565 left[left_pointer++] = array[pos - offset];
570 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
572 size_t idx = right[right_pointer--];
573 if (idx == 0) { idx = n; }
575 size_t symbol = text[idx];
576 bkt_s[symbol] = bkt_s[symbol] - 1;
579 size_t src = bkt_s[symbol];
580 if (src >= offset) { array[src - offset] = val; }
583 size_t part = src / partsize;
585 cached_array[part].push_back(val);
589 for (
size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(
true); }
591 for (
size_t i = 0; i < sigma; ++i) { bkt_s[i] = bkt_lms[i]; }
600 size_t last_end_pos = first_lms_pos, order = number_of_lms_strings - 1;
601 same_lms[number_of_lms_strings - 1] =
true;
602 for (
size_t i = number_of_lms_strings - 2, a = 0, b = 0, last_a = left[number_of_lms_strings - 1];
603 i < number_of_lms_strings;
611 if (end_pos - a == last_end_pos - b)
613 while (a < end_pos and text[a] == text[b])
618 if (text[a] == text[b])
624 last_end_pos = end_pos;
630 if (recursion == 0) { text_rec.
width((
bits::hi(order + 1) + 1)); }
635 text_rec.
resize(number_of_lms_strings);
638 if (recursion == 0 and n / 2 * text_rec.
width() > 8 * n)
640 size_t size_of_part = n / 4 + 3;
641 text_rec.
resize(size_of_part);
644 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
646 if (!same_lms[i]) { ++order; }
647 if (left[i] / 2 >= size_of_part) { text_rec[(left[i] / 2) - size_of_part] = order; }
651 for (
size_t i = 0; i < size_of_part; ++i)
653 if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
657 text_rec.
resize(size_of_part);
660 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
662 if (!same_lms[i]) { ++order; }
663 if (left[i] / 2 < size_of_part) { text_rec[left[i] / 2] = order; }
666 for (
size_t i = 0; i < size_of_part; ++i)
668 if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
670 text_rec.
resize(number_of_lms_strings);
672 for (
size_t i = 0; i < buf.
size(); ++i) { text_rec[pos++] = buf[i]; }
674 text_rec[number_of_lms_strings - 1] = 0;
678 text_rec.
resize(n / 2 + 1);
681 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
683 if (!same_lms[i]) { ++order; }
684 text_rec[left[left_pointer--] / 2] = order;
686 for (
size_t i = 0, pos = 0; i < text_rec.
size(); ++i)
688 if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
690 text_rec[number_of_lms_strings - 1] = 0;
691 text_rec.
resize(number_of_lms_strings);
694 util::clear(same_lms);
701 if (text_rec.
size() > order + 1)
706 _construct_sa_se<int_vector<>>(text_rec, filename_sa_rec, order + 1, recursion + 1);
712 for (
size_t i = 0; i < number_of_lms_strings; ++i)
714 text_rec[number_of_lms_strings + i] = text_rec[i];
721 number_of_lms_strings,
722 number_of_lms_strings,
727 text_rec.
resize(number_of_lms_strings);
733 isa_rec = std::move(text_rec);
737 if (isa_rec.
size() > 0)
746 util::init_support(lms_select_support, &lms_pos_b);
748 int_vector<> tmp_left(number_of_lms_strings, 0, int_width);
749 for (
size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
751 size_t idx = isa_rec[i];
752 size_t val = lms_select_support.
select(i + 1);
756 util::clear(lms_select_support);
757 util::clear(lms_pos_b);
758 util::clear(isa_rec);
762 for (; left_pointer < number_of_lms_strings; ++left_pointer)
764 left[left_pointer] = tmp_left[number_of_lms_strings - left_pointer - 1];
767 util::clear(tmp_left);
779 util::init_support(lms_select_support, &lms_pos_b);
782 for (uint64_t i = 0; i < sa_rec_buf.
size(); ++i)
784 uint64_t pos = lms_select_support.
select(sa_rec_buf[i] + 1);
785 left[number_of_lms_strings - 1 - left_pointer++] = pos;
787 sa_rec_buf.
close(
true);
801 size_t sa_pointer = 0;
803 size_t partsize = bkt_l_sum / parts + 1;
805 std::vector<int_vector_buffer<>> cached_array(parts - 1);
806 for (
size_t i = 0; i < cached_array.size(); ++i)
815 for (
size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
818 for (; pos < bkt_l[c]; ++pos)
821 if (pos - offset >= partsize)
824 for (
size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
826 size_t src = cached_array[cur_part][i++];
827 size_t val = cached_array[cur_part][i++];
828 array[src - offset] = val;
830 cached_array[pos / partsize - 1].reset();
832 size_t idx = array[pos - offset];
835 cached_sa[sa_pointer++] = idx;
836 right[right_pointer++] = idx;
840 size_t symbol = text[idx - 1];
841 cached_sa[sa_pointer++] = idx;
844 size_t val = idx - 1;
845 size_t src = bkt_l[symbol];
846 bkt_l[symbol] = bkt_l[symbol] + 1;
847 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
850 size_t part = src / partsize - 1;
852 cached_array[part].push_back(val);
857 right[right_pointer++] = idx;
863 while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
865 size_t idx = left[left_pointer--];
866 if (idx == 0) { idx = n; }
868 size_t symbol = text[idx];
870 size_t src = bkt_l[symbol];
871 bkt_l[symbol] = bkt_l[symbol] + 1;
872 if ((src - offset) / partsize == 0) { array[src - offset] = val; }
875 size_t part = src / partsize - 1;
877 cached_array[part].push_back(val);
881 for (
size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(
true); }
888 size_t partsize = bound_s / parts + 1;
891 std::vector<int_vector_buffer<>> cached_array(parts - 1);
892 for (
size_t i = 0; i < cached_array.size(); ++i)
901 for (
size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
904 assert(c < C.
size());
905 sa_pointer = C[c] - 1;
906 for (; pos + 1 > bkt_s[c]; --pos)
912 for (
size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
914 size_t src = cached_array[cur_part][i++];
915 size_t val = cached_array[cur_part][i++];
916 assert((src - offset) < array.
size());
917 array[src - offset] = val;
919 assert((offset / partsize) < parts - 1);
920 cached_array[offset / partsize].reset();
923 assert((pos - offset) < array.
size());
924 size_t idx = array[pos - offset];
925 if (idx == 0) { idx = n; }
927 assert((idx) < text.size());
928 size_t symbol = text[idx];
931 if (idx == n - 1) { cached_sa[sa_pointer--] = 0; }
934 cached_sa[sa_pointer--] = idx + 1;
936 assert((symbol) < bkt_s.size());
937 bkt_s[symbol] = bkt_s[symbol] - 1;
939 size_t src = bkt_s[symbol];
942 assert((src - offset) < array.
size());
943 array[src - offset] = val;
947 size_t part = src / partsize;
948 assert(part < parts - 1);
949 cached_array[part].push_back(src);
950 cached_array[part].push_back(val);
955 if (idx == n - 1) { cached_sa[sa_pointer--] = 0; }
958 cached_sa[sa_pointer--] = idx + 1;
963 while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
965 size_t idx = right[right_pointer--];
966 if (idx == 0) { idx = n; }
968 size_t symbol = text[idx];
969 assert((symbol) < bkt_s.size());
970 bkt_s[symbol] = bkt_s[symbol] - 1;
973 size_t src = bkt_s[symbol];
976 assert((src - offset) < array.
size());
977 array[src - offset] = val;
981 size_t part = src / partsize;
982 assert((part) < parts - 1);
983 cached_array[part].push_back(src);
984 cached_array[part].push_back(val);
988 for (
size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(
true); }
uint64_t size() const
Returns the number of elements currently stored.
void reset()
Delete all content and set size to 0.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
size_type size() const noexcept
The number of elements in the int_vector.
void push_back(value_type value)
Insert element at the end.
void resize(const size_type size)
Resize the int_vector in terms of elements.
static mm_event_proxy event(const std::string &name)
A class supporting constant time select queries.
size_type select(size_type i) const
Select function.
int_vector.hpp contains the sdsl::int_vector class.
io.hpp contains some methods for reading/writing sdsl structures.
std::string to_string(const T &t, int w=1)
void _set_zero_bits(t_int_vec &v)
Sets all bits of the int_vector to 0-bits.
Namespace for the succinct data structure library.
void _construct_sa_IS(int_vector<> &text, int_vector<> &sa, std::string &filename_sa, size_t n, size_t text_offset, size_t sigma, uint64_t recursion)
uint64_t _get_next_lms_position(int_vector_type &text, uint64_t i)
void _construct_sa_se(int_vector_type &text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
std::string tmp_file(const cache_config &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
bool load_from_file(T &v, const std::string &file)
Load sdsl-object v from a file.
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
int remove(const std::string &)
Remove a file.
rank_support.hpp contains classes that support a sdsl::bit_vector with constant time rank information...
select_support.hpp contains classes that support a sdsl::bit_vector with constant time select informa...
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.