SDSL  3.0.0
Succinct Data Structure Library
construct_lcp.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
8 #ifndef INCLUDED_SDSL_CONSTRUCT_LCP
9 #define INCLUDED_SDSL_CONSTRUCT_LCP
10 
11 #include <algorithm>
12 #include <iostream>
13 #include <stdexcept>
14 
15 #include <sdsl/config.hpp>
16 #include <sdsl/construct_bwt.hpp>
17 #include <sdsl/construct_isa.hpp>
19 #include <sdsl/int_vector.hpp>
20 #include <sdsl/rank_support.hpp>
21 #include <sdsl/select_support.hpp>
22 #include <sdsl/sfstream.hpp>
23 #include <sdsl/util.hpp>
24 #include <sdsl/wt_algorithm.hpp>
25 #include <sdsl/wt_huff.hpp>
26 
27 //#define STUDY_INFORMATIONS
28 
29 namespace sdsl
30 {
31 
32 // Forward declaration of construction method
33 // template <class t_index>
34 // void construct(t_index& idx, const std::string& file);
35 
37 
54 template <uint8_t t_width>
56 {
57  static_assert(t_width == 0 or t_width == 8,
58  "construct_lcp_kasai: width must be `0` for integer alphabet and `8` for byte alphabet");
59  int_vector<> lcp;
61  construct_isa(config);
62  {
64  if (!load_from_cache(text, key_text_trait<t_width>::KEY_TEXT, config)) { return; }
66  std::ios::in,
67  1000000); // init isa file_buffer
68  int_vector<> sa;
69  if (!load_from_cache(sa, conf::KEY_SA, config)) { return; }
70  // use Kasai algorithm to compute the lcp values
71  for (size_type i = 0, j = 0, sa_1 = 0, l = 0, n = isa_buf.size(); i < n; ++i)
72  {
73  sa_1 = isa_buf[i]; // = isa[i]
74  if (sa_1)
75  {
76  j = sa[sa_1 - 1];
77  if (l) --l;
78  assert(i != j);
79  while (text[i + l] == text[j + l])
80  { // i+l < n and j+l < n are not necessary, since text[n]=0 and text[i]!=0 (i<n) and i!=j
81  ++l;
82  }
83  sa[sa_1 - 1] = l; // overwrite sa array with lcp values
84  }
85  else
86  {
87  l = 0;
88  sa[n - 1] = 0;
89  }
90  }
91 
92  for (size_type i = sa.size(); i > 1; --i) { sa[i - 1] = sa[i - 2]; }
93  sa[0] = 0;
94  lcp = std::move(sa);
95  }
96  store_to_cache(lcp, conf::KEY_LCP, config);
97 }
98 
100 
115 template <uint8_t t_width>
117 {
118  static_assert(t_width == 0 or t_width == 8,
119  "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
121  typedef int_vector<t_width> text_type;
124  size_type n = sa_buf.size();
125 
126  assert(n > 0);
127  if (1 == n)
128  { // Handle special case: Input only the sentinel character.
129  int_vector<> lcp(1, 0);
130  store_to_cache(lcp, conf::KEY_LCP, config);
131  return;
132  }
133 
134  // (1) Calculate PHI (stored in array plcp)
135  int_vector<> plcp(n, 0, sa_buf.width());
136  for (size_type i = 0, sai_1 = 0; i < n; ++i)
137  {
138  size_type sai = sa_buf[i];
139  plcp[sai] = sai_1;
140  sai_1 = sai;
141  }
142 
143  // (2) Load text from disk
144  text_type text;
145  load_from_cache(text, KEY_TEXT, config);
146 
147  // (3) Calculate permuted LCP array (text order), called PLCP
148  size_type max_l = 0;
149  for (size_type i = 0, l = 0; i < n - 1; ++i)
150  {
151  size_type phii = plcp[i];
152  while (text[i + l] == text[phii + l]) { ++l; }
153  plcp[i] = l;
154  if (l)
155  {
156  max_l = std::max(max_l, l);
157  --l;
158  }
159  }
160  util::clear(text);
161  uint8_t lcp_width = bits::hi(max_l) + 1;
162 
163  // (4) Transform PLCP into LCP
164  std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
165  size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
166  int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width); // open buffer for lcp
167  lcp_buf[0] = 0;
168  sa_buf.buffersize(buffer_size);
169  for (size_type i = 1; i < n; ++i)
170  {
171  size_type sai = sa_buf[i];
172  lcp_buf[i] = plcp[sai];
173  }
174  lcp_buf.close();
176 }
177 
179 
196 {
199  size_type n = sa_buf.size();
200  if (1 == n)
201  {
202  int_vector<> lcp(1, 0);
203  store_to_cache(lcp, conf::KEY_LCP, config);
204  return;
205  }
206  const uint8_t log_q = 6; // => q=64
207  const uint32_t q = 1 << log_q;
208  const uint64_t modq = bits::lo_set[log_q];
209 
210  // n-1 is the maximum entry in SA
211  int_vector<64> plcp((n - 1 + q) >> log_q);
212 
213  for (size_type i = 0, sai_1 = 0; i < n; ++i)
214  { // we can start at i=0. if SA[i]%q==0
215  // we set PHI[(SA[i]=n-1)%q]=0, since T[0]!=T[n-1]
216  size_type sai = sa_buf[i];
217  if ((sai & modq) == 0)
218  {
219  if ((sai >> log_q) >= plcp.size())
220  {
221  // std::cerr<<"sai="<<sai<<" log_q="<<log_q<<" sai>>log_q="<<(sai>>log_q)<<"
222  // "<<sai_1<<std::endl; std::cerr<<"n="<<n<<" "<<" plcp.size()="<<plcp.size();
223  }
224  plcp[sai >> log_q] = sai_1;
225  }
226  sai_1 = sai;
227  }
228 
229  int_vector<8> text;
230  load_from_cache(text, conf::KEY_TEXT, config);
231 
232  for (size_type i = 0, j, k, l = 0; i < plcp.size(); ++i)
233  {
234  j = i << log_q; // j=i*q
235  k = plcp[i];
236  while (text[j + l] == text[k + l]) ++l;
237  plcp[i] = l;
238  if (l >= q) { l -= q; }
239  else
240  {
241  l = 0;
242  }
243  }
244 
245  size_type buffer_size = 4000000; // buffer_size is a multiple of 8!
246  sa_buf.buffersize(buffer_size);
248  std::ios::out,
249  buffer_size,
250  sa_buf.width()); // open buffer for plcp
251 
252  for (size_type i = 0, sai_1 = 0, l = 0, sai = 0, iq = 0; i < n; ++i)
253  {
254  /*size_type*/ sai = sa_buf[i];
255  // std::cerr<<"i="<<i<<" sai="<<sai<<std::endl;
256  if ((sai & modq) == 0)
257  { // we have already worked the value out ;)
258  lcp_out_buf[i] = l = plcp[sai >> log_q];
259  }
260  else
261  {
262  /*size_type*/ iq = sai & bits::lo_unset[log_q];
263  l = plcp[sai >> log_q];
264  if (l > (sai - iq))
265  l -= (sai - iq);
266  else
267  l = 0;
268  while (text[sai + l] == text[sai_1 + l]) ++l;
269  lcp_out_buf[i] = l;
270  }
271 #ifdef CHECK_LCP
272  size_type j = 0;
273  for (j = 0; j < l; ++j)
274  {
275  if (text[sai + j] != text[sai_1 + j])
276  {
277  std::cout << "lcp[" << i << "]=" << l << " is two big! " << j << " is right!"
278  << " sai=" << sai << std::endl;
279  if ((sai & modq) != 0)
280  std::cout << " plcp[sai>>log_q]=" << plcp[sai >> log_q] << " sai-iq=" << sai - iq << " sai=" << sai
281  << " sai-iq=" << sai - iq << std::endl;
282  break;
283  }
284  }
285 #endif
286  sai_1 = sai;
287  }
288  lcp_out_buf.close();
290  return;
291 }
292 
294 
312 inline void construct_lcp_go(cache_config & config)
313 {
315 #ifdef STUDY_INFORMATIONS
316  size_type racs = 0; // random accesses to the text
317  size_type matches = 0;
318  size_type comps2 = 0; // comparisons the second phase
319 #endif
320  int_vector<8> text;
321  load_from_cache(text, conf::KEY_TEXT, config);
322  int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
323  const size_type n = sa_buf.size();
324  const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
325 
326  if (1 == n)
327  {
328  int_vector<> lcp(1, 0);
329  store_to_cache(lcp, conf::KEY_LCP, config);
330  return;
331  }
332 
333  size_type cnt_c[257] = { 0 }; // counter for each character in the text
334  size_type cnt_cc[257] = { 0 }; // prefix sum of the counter cnt_c
335  size_type cnt_cc2[257] = { 0 }; //
336  size_type omitted_c[257] = { 0 }; // counts the omitted occurrences for the second phase
337  size_type prev_occ_in_bwt[256] = { 0 }; // position of the previous occurrence of each character c in the bwt
338  for (size_type i = 0; i < 256; ++i) prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
339  unsigned char alphabet[257] = { 0 };
340  uint8_t sigma = 0;
341 
342  tLI m_list[2][256];
343  size_type m_char_count[2] = { 0 };
344  uint8_t m_chars[2][256] = { { 0 }, { 0 } };
345 
346  size_type nn = 0; // n' for phase 2
347  // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
348  {
349 
350  int_vector<8> lcp_sml(n,
351  0); // initialize array for small values of first phase; note lcp[0]=0
352  size_type done_cnt = 0;
353 
354  for (size_type i = 0; i < n; ++i)
355  { // initialize cnt_c
356  ++cnt_c[text[i] + 1];
357  }
358  for (int i = 1; i < 257; ++i)
359  { // calculate sigma and initailize cnt_cc
360  if (cnt_c[i] > 0) { alphabet[sigma++] = (unsigned char)(i - 1); }
361  cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
362  }
363  alphabet[sigma] = '\0';
364  {
365  int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
366  size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
367  uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
368  lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
369  prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
370  ++omitted_c[alphabet[0]]; //
371 
372  int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
373  rmq_stack[0] = 0;
374  rmq_stack[1] = 0; // first element (-1, -1)
375  rmq_stack[2] = 1;
376  rmq_stack[3] = 0; // second element (0, -1)
377  size_type rmq_end = 3; // index of the value of the topmost element
378 
379  const size_type m_mod2 = m % 2;
380  uint8_t cur_c = alphabet[1];
381  size_type big_val = 0;
382  for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
383  {
384  uint8_t bwti = bwt_buf[i];
385  sai = sa_buf[i];
386  size_type lf = cnt_cc[bwti];
387  if (!cur_c_cnt)
388  { // cur_c_cnt==0, if there is no more occurence of the current character
389  if (cur_c_cnt < sigma) { cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1]; }
390  }
391  size_type l = 0;
392  if (i >= cnt_cc[cur_c])
393  { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
394  if (bwti == bwti_1 and lf < i)
395  { // BWT[i]==BWT[i-1]
396  l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
397  if (l == m)
398  { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
399  l += (text[sai_1 + m] == text[sai + m]);
400 #ifdef STUDY_INFORMATIONS
401  if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
402  ++racs;
403 #endif
404  }
405  lcp_sml[i] = l;
406  ++done_cnt;
407  }
408  else
409  { // BWT[i] != BWT[i-1] or LF[i] > i
410  if (lf < i) l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
411 #ifdef STUDY_INFORMATIONS
412  if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
413  ++racs;
414 #endif
415  while (text[sai_1 + l] == text[sai + l] and l < m + 1)
416  {
417  ++l;
418 #ifdef STUDY_INFORMATIONS
419  ++matches;
420 #endif
421  }
422  lcp_sml[i] = l;
423  }
424  }
425  else
426  { // if already done
427  l = lcp_sml[i]; // load LCP value
428  }
429  if (l > m)
430  {
431  ++big_val;
432  if (i > 10000 and i < 10500 and big_val > 3000)
433  { // if most of the values are big: switch to PHI algorithm
434  util::clear(text);
435  util::clear(lcp_sml);
436  construct_lcp_PHI<8>(config);
437  return;
438  }
439  }
440  // invariant: l <= m+1
441  // begin update rmq_stack
442  size_type x = l + 1;
443  size_type j = rmq_end;
444  while (x <= rmq_stack[j]) j -= 2; // pop all elements with value >= l
445  rmq_stack[++j] = i + 1; // push position i
446  rmq_stack[++j] = x; // push value l
447  rmq_end = j; // update index of the value of the topmost element
448  if (lf > i)
449  { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
450  ++done_cnt;
451  // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
452  // rmq is linear in the stack size; can also be implemented with binary search on the stack
453  size_type x_pos = prev_occ_in_bwt[bwti] + 2;
454  j = rmq_end - 3;
455  while (x_pos <= rmq_stack[j]) j -= 2; // search smallest value in the interval I
456  lcp_sml[lf] = rmq_stack[j + 3] -
457  (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
458  }
459  if (l >= m)
460  {
461  if (l == m) push_front_m_index(nn, cur_c, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
462  ++nn;
463  }
464  else
465  ++omitted_c[cur_c];
466 
467  prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
468  ++cnt_cc[bwti]; // update counter and therefore the LF information
469  sai_1 = sai; // update SA[i-1]
470  bwti_1 = bwti; // update BWT[i-1]
471  }
472  }
473  util::clear(text);
474 
475  if (n > 1000 and nn > 5 * (n / 6))
476  { // if we would occupy more space than the PHI algorithm => switch to PHI algorithm
477  util::clear(lcp_sml);
478  construct_lcp_PHI<8>(config);
479  return;
480  }
481  store_to_cache(lcp_sml, "lcp_sml", config);
482  }
483 #ifdef STUDY_INFORMATIONS
484  std::cout << "# n=" << n << " nn=" << nn << " nn/n=" << ((double)nn) / n << std::endl;
485 #endif
486 
487  // phase 2: calculate lcp_big
488  {
489  // std::cout<<"# begin calculating LF' values"<<std::endl;
490  int_vector<> lcp_big(nn,
491  0,
492  bits::hi(n - 1) + 1); // lcp_big first contains adapted LF values and finally the big LCP
493  // values
494  {
495  // initialize lcp_big with adapted LF values
496  bit_vector todo(n, 0); // bit_vector todo indicates which values are >= m in lcp_sml
497  {
498  // initialize bit_vector todo
499  int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
500  for (size_type i = 0; i < n; ++i)
501  {
502  if (lcp_sml_buf[i] >= m) { todo[i] = 1; }
503  }
504  }
505 
506  cnt_cc2[0] = cnt_cc[0] = 0;
507  for (size_type i = 1, omitted_sum = 0; i < 257; ++i)
508  { // initialize cnt_cc and cnt_cc2
509  cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
510  omitted_sum += omitted_c[i - 1];
511  cnt_cc2[i] = cnt_cc[i] - omitted_sum;
512  }
513 
514  int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
515  for (size_type i = 0, i2 = 0; i < n; ++i)
516  {
517  uint8_t b = bwt_buf[i]; // store BWT[i]
518  size_type lf_i = cnt_cc[b]; // LF[i]
519  if (todo[i])
520  { // LCP[i] is a big value
521  if (todo[lf_i])
522  { // LCP[LF[i]] is a big entry
523  lcp_big[i2] = cnt_cc2[b]; // LF'[i]
524  } /*else{
525  * lcp_big[i2] = 0;
526  }*/
527  ++i2;
528  }
529  if (todo[lf_i])
530  { // LCP[LF[i]] is a big entry
531  ++cnt_cc2[b]; // increment counter for adapted LF
532  }
533  ++cnt_cc[b]; // increment counter for LF
534  }
535  }
536 
537  // std::cout<<"# begin initializing bwt2, shift_bwt2, run2"<<std::endl;
538  int_vector<8> bwt2(nn),
539  shift_bwt2(nn); // BWT of big LCP values, and shifted BWT of
540  // big LCP values
541  bit_vector run2(nn + 1); // indicates for each entry i, if i and i-1 are both big LCP values
542  run2[nn] = 0; // index nn is not a big LCP value
543  {
544  // initialize bwt2, shift_bwt2, adj2
545  int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
546  int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
547  uint8_t b_1 = '\0'; // BWT[i-1]
548  bool is_run = false;
549  for (size_type i = 0, i2 = 0; i < n; ++i)
550  {
551  uint8_t b = bwt_buf[i];
552  if (lcp_sml_buf[i] >= m)
553  {
554  bwt2[i2] = b;
555  shift_bwt2[i2] = b_1;
556  run2[i2] = is_run;
557  is_run = true;
558  ++i2;
559  }
560  else
561  {
562  is_run = false;
563  }
564  b_1 = b;
565  }
566  }
567 
568  bit_vector todo2(nn + 1, 1); // init all values with 1, except
569  todo2[nn] = 0; // the last one! (handels case "i < nn")
570 
571  // std::cout<<"# begin calculating m-indices"<<std::endl;
572  {
573  // calculate m-indices, (m+1)-indices,... until we are done
574  size_type m2 = m;
575  size_type char_ex[256];
576  for (size_type i = 0; i < 256; ++i) char_ex[i] = nn;
577  size_type char_occ = 0;
578  size_type m_mod2 = m2 % 2, mm1_mod2 = (m2 + 1) % 2;
579  while (m_char_count[m_mod2] > 0)
580  { // while there are m-indices, calculate (m+1)-indices and write m-indices
581  // For all values LCP[i] >= m2 it follows that todo2[i] == 1
582  // list m_list[mm1_mod2][b] is sorted in decreasing order
583  ++m2;
584  mm1_mod2 = (m2 + 1) % 2, m_mod2 = m2 % 2;
585  m_char_count[m_mod2] = 0;
586 
587  std::sort(m_chars[mm1_mod2],
588  m_chars[mm1_mod2] + m_char_count[mm1_mod2]); // TODO: ersetzen?
589 
590  for (size_type mc = 0; mc < m_char_count[mm1_mod2]; ++mc)
591  { // for every character
592  tLI & mm1_mc_list = m_list[mm1_mod2][m_chars[mm1_mod2][m_char_count[mm1_mod2] - 1 - mc]];
593  // size_type old_i = nn;
594  while (!mm1_mc_list.empty())
595  {
596  size_type i = mm1_mc_list.front(); // i in [0..n-1]
597  mm1_mc_list.pop_front();
598  // For all values LCP[i] >= m-1 it follows that todo2[i] == 1
599  for (size_type k = i; todo2[k]; --k)
600  {
601 #ifdef STUDY_INFORMATIONS
602  ++comps2;
603 #endif
604  uint8_t b = shift_bwt2[k];
605  if (char_ex[b] != i)
606  {
607  char_ex[b] = i;
608  ++char_occ;
609  }
610  if (!run2[k]) break;
611  }
612  for (size_type k = i; todo2[k] and char_occ; ++k)
613  {
614 #ifdef STUDY_INFORMATIONS
615  ++comps2;
616 #endif
617  uint8_t b = bwt2[k];
618  if (char_ex[b] == i)
619  {
620  size_type p = lcp_big[k];
621  push_back_m_index(p, b, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
622  char_ex[b] = nn;
623  --char_occ;
624  }
625  if (!run2[k + 1]) break;
626  }
627  lcp_big[i] = m2 - 1;
628  todo2[i] = 0;
629  // old_i = i;
630  }
631  }
632  }
633  }
634 
635  store_to_cache(lcp_big, "lcp_big", config);
636  } // end phase 2
637 
638  // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
639  // phase 3: merge lcp_sml and lcp_big and save to disk
640  {
641  const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
642  int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
643  config)); // file buffer containing the big LCP values
644  int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
645  std::ios::in,
646  buffer_size); // file buffer containing the small LCP values
648  std::ios::out,
649  buffer_size,
650  lcp_big_buf.width()); // buffer for the resulting LCP array
651  for (size_type i = 0, i2 = 0; i < n; ++i)
652  {
653  size_type l = lcp_sml_buf[i];
654  if (l >= m)
655  { // if l >= m it is stored in lcp_big
656  l = lcp_big_buf[i2];
657  ++i2;
658  }
659  lcp_buf[i] = l;
660  }
661  lcp_buf.close();
662  }
664 
665 #ifdef STUDY_INFORMATIONS
666  std::cout << "# racs: " << racs << std::endl;
667  std::cout << "# matches: " << matches << std::endl;
668  std::cout << "# comps2: " << comps2 << std::endl;
669 #endif
670  return;
671 }
672 
674 
692 inline void construct_lcp_goPHI(cache_config & config)
693 {
695  int_vector<8> text;
696  load_from_cache(text, conf::KEY_TEXT, config); // load text from file system
697  int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
698  const size_type n = sa_buf.size();
699  const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
700 
701  if (1 == n)
702  {
703  int_vector<> lcp(1, 0);
704  store_to_cache(lcp, conf::KEY_LCP, config);
705  return;
706  }
707 
708  size_type cnt_c[257] = { 0 }; // counter for each character in the text
709  size_type cnt_cc[257] = { 0 }; // prefix sum of the counter cnt_c
710  size_type omitted_c[257] = { 0 }; // counts the omitted occurrences for the second phase
711  size_type prev_occ_in_bwt[256] = { 0 }; // position of the previous occurrence of each character c in the bwt
712  for (size_type i = 0; i < 256; ++i) prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
713  unsigned char alphabet[257] = { 0 };
714  uint8_t sigma = 0;
715 
716  size_type nn = 0; // n' for phase 2
717  // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
718  {
719 
720  int_vector<8> lcp_sml(n,
721  0); // initialize array for small values of first phase; note lcp[0]=0
722  size_type done_cnt = 0;
723 
724  for (size_type i = 0; i < n; ++i)
725  { // initialize cnt_c
726  ++cnt_c[text[i] + 1];
727  }
728  for (int i = 1; i < 257; ++i)
729  { // calculate sigma and initailize cnt_cc
730  if (cnt_c[i] > 0) { alphabet[sigma++] = (unsigned char)(i - 1); }
731  cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
732  }
733  alphabet[sigma] = '\0';
734  {
735  int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
736  size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
737  uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
738  lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
739  prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
740  ++omitted_c[alphabet[0]]; //
741 
742  int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
743  rmq_stack[0] = 0;
744  rmq_stack[1] = 0; // first element (-1, -1)
745  rmq_stack[2] = 1;
746  rmq_stack[3] = 0; // second element (0, -1)
747  size_type rmq_end = 3; // index of the value of the topmost element
748 
749  uint8_t cur_c = alphabet[1];
750  for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
751  {
752  uint8_t bwti = bwt_buf[i];
753  sai = sa_buf[i];
754  size_type lf = cnt_cc[bwti];
755  if (!cur_c_cnt)
756  { // cur_c_cnt==0, if there is no more occurence of the current character
757  if (cur_c_cnt < sigma) { cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1]; }
758  }
759  size_type l = 0;
760  if (i >= cnt_cc[cur_c])
761  { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
762  if (bwti == bwti_1 and lf < i)
763  { // BWT[i]==BWT[i-1]
764  l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
765  if (l == m)
766  { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
767  l += (text[sai_1 + m] == text[sai + m]);
768  }
769  lcp_sml[i] = l;
770  ++done_cnt;
771  }
772  else
773  { // BWT[i] != BWT[i-1] or LF[i] > i
774  if (lf < i) l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
775  while (text[sai_1 + l] == text[sai + l] and l < m + 1) { ++l; }
776  lcp_sml[i] = l;
777  }
778  }
779  else
780  { // if already done
781  l = lcp_sml[i]; // load LCP value
782  }
783  // invariant: l <= m+1
784  // begin update rmq_stack
785  size_type x = l + 1;
786  size_type j = rmq_end;
787  while (x <= rmq_stack[j]) j -= 2; // pop all elements with value >= l
788  rmq_stack[++j] = i + 1; // push position i
789  rmq_stack[++j] = x; // push value l
790  rmq_end = j; // update index of the value of the topmost element
791  if (lf > i)
792  { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
793  ++done_cnt;
794  // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
795  // rmq is linear in the stack size; can also be implemented with binary search on the stack
796  size_type x_pos = prev_occ_in_bwt[bwti] + 2;
797  j = rmq_end - 3;
798  while (x_pos <= rmq_stack[j]) j -= 2; // search smallest value in the interval I
799  lcp_sml[lf] = rmq_stack[j + 3] -
800  (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
801  }
802  if (l > m) { ++nn; }
803  else
804  ++omitted_c[cur_c];
805 
806  prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
807  ++cnt_cc[bwti]; // update counter and therefore the LF information
808  sai_1 = sai; // update SA[i-1]
809  bwti_1 = bwti; // update BWT[i-1]
810  }
811  }
812  store_to_cache(lcp_sml, "lcp_sml", config);
813  }
814 
815  // phase 2: calculate lcp_big with PHI algorithm on remaining entries of LCP
816  {
817  int_vector<> lcp_big(0, 0, bits::hi(n - 1) + 1); // nn, 0, bits::hi(n-1)+1);
818  {
819 
820  memory_monitor::event("lcp-init-phi-begin");
821  size_type sa_n_1 = 0; // value for SA[n-1]
822  bit_vector todo(n, 0); // bit_vector todo indicates which values are > m in lcp_sml
823  {
824  // initialize bit_vector todo
825  int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
826  for (size_type i = 0; i < n; ++i)
827  {
828  if (lcp_sml_buf[i] > m) { todo[sa_buf[i]] = 1; }
829  }
830  sa_n_1 = sa_buf[n - 1];
831  }
832  rank_support_v<> todo_rank(&todo); // initialize rank for todo
833 
834  const size_type bot = sa_n_1;
835  int_vector<> phi(nn, bot, bits::hi(n - 1) + 1); // phi
836 
837  int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
838  int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
839  uint8_t b_1 = 0;
840  for (size_type i = 0, sai_1 = 0; i < n; ++i)
841  { // initialize phi
842  uint8_t b = bwt_buf[i]; // store BWT[i]
843  size_type sai = sa_buf[i];
844  if (lcp_sml_buf[i] > m and b != b_1)
845  { // if i is a big irreducable value
846  phi[todo_rank(sai)] = sai_1;
847  } // otherwise phi is equal to bot
848  b_1 = b;
849  sai_1 = sai;
850  }
851  memory_monitor::event("lcp-init-phi-end");
852 
853  memory_monitor::event("lcp-calc-plcp-begin");
854  for (size_type i = 0, ii = 0, l = m + 1, p = 0; i < n and ii < nn; ++i)
855  { // execute compact Phi algorithm
856  if (todo[i])
857  {
858  if (i > 0 and todo[i - 1])
859  l = l - 1;
860  else
861  l = m + 1;
862  if ((p = phi[ii]) != bot)
863  {
864  while (text[i + l] == text[p + l]) ++l;
865  }
866  phi[ii++] = l;
867  }
868  }
869  memory_monitor::event("lcp-calc-plcp-end");
870  util::clear(text);
871 
872  memory_monitor::event("lcp-calc-lcp-begin");
873  lcp_big.resize(nn);
874  for (size_type i = 0, ii = 0; i < n and ii < nn; ++i)
875  {
876  if (lcp_sml_buf[i] > m) { lcp_big[ii++] = phi[todo_rank(sa_buf[i])]; }
877  }
878  memory_monitor::event("lcp-calc-lcp-end");
879  }
880  store_to_cache(lcp_big, "lcp_big", config);
881  } // end phase 2
882 
883  // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
884  // phase 3: merge lcp_sml and lcp_big and save to disk
885  {
886  const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
887  int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
888  config)); // file buffer containing the big LCP values
889  int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
890  std::ios::in,
891  buffer_size); // file buffer containing the small LCP values
893  std::ios::out,
894  buffer_size,
895  lcp_big_buf.width()); // file buffer for the resulting LCP array
896 
897  for (size_type i = 0, i2 = 0; i < n; ++i)
898  {
899  size_type l = lcp_sml_buf[i];
900  if (l > m)
901  { // if l > m it is stored in lcp_big
902  l = lcp_big_buf[i2];
903  ++i2;
904  }
905  lcp_buf[i] = l;
906  }
907  lcp_big_buf.close(true); // close buffer and remove file
908  lcp_sml_buf.close(true); // close buffer and remove file
909  }
911  return;
912 }
913 
915 
930 template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
932 {
934  std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
935 
936  // create WT
937  memory_monitor::event("lcp-bwt-create-wt-huff-begin");
938  t_wt wt_bwt;
939  construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
940  uint64_t n = wt_bwt.size();
941  memory_monitor::event("lcp-bwt-create-wt-huff-end");
942 
943  // init
944  memory_monitor::event("lcp-bwt-init-begin");
945  size_type lcp_value = 0; // current LCP value
946  size_type lcp_value_offset = 0; // Largest LCP value in LCP array, that was written on disk
947  size_type phase = 0; // Count how often the LCP array was written on disk
948 
949  size_type intervals = 0; // number of intervals which are currently stored
950  size_type intervals_new = 0; // number of new intervals
951 
952  std::queue<size_type> q; // Queue for storing the intervals
953  std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
954  size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
955  bool queue_used = true;
956  size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
957  // else use dictionary and wavelet tree
958 
959  size_type quantity; // quantity of characters in interval
960  std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
961  std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
962  std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
963 
964  // Calculate how many bit are for each lcp value available, to limit the memory usage to 20n bit = 2,5n byte, use at
965  // moste 8 bit
966  size_type bb = (n * 20 - size_in_bytes(wt_bwt) * 8 * 1.25 - 5 * n) /
967  n; // 20n - size of wavelet tree * 1.25 for rank support - 5n for bit arrays - n for index_done array
968  if (n * 20 < size_in_bytes(wt_bwt) * 8 * 1.25 + 5 * n) { bb = 6; }
969  bb = std::min(bb, (size_type)8);
970 
971  size_type lcp_value_max = (1ULL << bb) - 1; // Largest LCP value that can be stored into the LCP array
972  size_type space_in_bit_for_lcp = n * bb; // Space for the LCP array in main memory
973 
974 #ifdef STUDY_INFORMATIONS
975  std::cout << "# l=" << n << " b=" << (int)bb << " lcp_value_max=" << lcp_value_max
976  << " size_in_bytes(wt_bwt<v,bs,bs>)=" << size_in_bytes(wt_bwt) << std::endl;
977 #endif
978 
979  // init partial_lcp
980  int_vector<> partial_lcp(n, 0, bb); // LCP array
981 
982  // init index_done
983  bit_vector index_done(n + 1, false); // bit_vector indicates if entry LCP[i] is amend
984  rank_support_v<> ds_rank_support; // Rank support for bit_vector index_done
985 
986  // create C-array
987  std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
988  create_C_array(C, wt_bwt);
989  memory_monitor::event("lcp-bwt-init-begin-end");
990  // calculate lcp
991  memory_monitor::event("lcp-bwt-calc-values-begin");
992 
993  // calculate first intervals
994  partial_lcp[0] = 0;
995  index_done[0] = true;
996  interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
997  for (size_type i = 0; i < quantity; ++i)
998  {
999  unsigned char c = cs[i];
1000  size_type a_new = C[c] + rank_c_i[i];
1001  size_type b_new = C[c] + rank_c_j[i];
1002 
1003  // Save LCP value if not seen before
1004  if (!index_done[b_new])
1005  {
1006  if (b_new < n) partial_lcp[b_new] = lcp_value;
1007  index_done[b_new] = true;
1008  // Save interval
1009  q.push(a_new);
1010  q.push(b_new);
1011  ++intervals;
1012  }
1013  }
1014  ++lcp_value;
1015 
1016  // calculate LCP values phase by phase
1017  while (intervals)
1018  {
1019  if (intervals < use_queue_and_wt && !queue_used)
1020  {
1021  memory_monitor::event("lcp-bwt-bitvec2queue-begin");
1022  util::clear(dict[target]);
1023 
1024  // copy from bitvector to queue
1025  size_type a2 = util::next_bit(dict[source], 0);
1026  size_type b2 = util::next_bit(dict[source], a2 + 1);
1027  while (b2 < dict[source].size())
1028  {
1029  q.push((a2 - 1) >> 1);
1030  q.push(b2 >> 1);
1031  // get next interval
1032  a2 = util::next_bit(dict[source], b2 + 1);
1033  b2 = util::next_bit(dict[source], a2 + 1);
1034  }
1035  util::clear(dict[source]);
1036  memory_monitor::event("lcp-bwt-bitvec2queue-end");
1037  }
1038  if (intervals >= use_queue_and_wt && queue_used)
1039  {
1040  memory_monitor::event("lcp-bwt-queue2bitvec-begin");
1041  dict[source].resize(2 * (n + 1));
1042 
1043  util::set_to_value(dict[source], 0);
1044  // copy from queue to bitvector
1045  while (!q.empty())
1046  {
1047  dict[source][(q.front() << 1) + 1] = 1;
1048  q.pop();
1049  dict[source][(q.front() << 1)] = 1;
1050  q.pop();
1051  }
1052  dict[target].resize(2 * (n + 1));
1053 
1054  util::set_to_value(dict[target], 0);
1055  memory_monitor::event("lcp-bwt-queue2bitvec-end");
1056  }
1057 
1058  if (intervals < use_queue_and_wt)
1059  {
1060  queue_used = true;
1061  intervals_new = 0;
1062  while (intervals)
1063  {
1064  // get next interval
1065  size_type a = q.front();
1066  q.pop();
1067  size_type b = q.front();
1068  q.pop();
1069  --intervals;
1070 
1071  interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1072  for (size_type i = 0; i < quantity; ++i)
1073  {
1074  unsigned char c = cs[i];
1075  size_type a_new = C[c] + rank_c_i[i];
1076  size_type b_new = C[c] + rank_c_j[i];
1077 
1078  // Save LCP value if not seen before
1079  if (!index_done[b_new] and phase == 0)
1080  {
1081  partial_lcp[b_new] = lcp_value;
1082  index_done[b_new] = true;
1083  // Save interval
1084  q.push(a_new);
1085  q.push(b_new);
1086  ++intervals_new;
1087  }
1088  else if (!index_done[b_new])
1089  {
1090  size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1091  if (!partial_lcp[insert_pos])
1092  {
1093  partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1094  // Save interval
1095  q.push(a_new);
1096  q.push(b_new);
1097  ++intervals_new;
1098  }
1099  }
1100  }
1101  }
1102  intervals = intervals_new;
1103  }
1104  else
1105  {
1106  queue_used = false;
1107  intervals = 0;
1108 
1109  // get next interval
1110  size_type a2 = util::next_bit(dict[source], 0);
1111  size_type b2 = util::next_bit(dict[source], a2 + 1);
1112 
1113  while (b2 < dict[source].size())
1114  {
1115  interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1116  for (size_type i = 0; i < quantity; ++i)
1117  {
1118  unsigned char c = cs[i];
1119  size_type a_new = C[c] + rank_c_i[i];
1120  size_type b_new = C[c] + rank_c_j[i];
1121  // Save LCP value if not seen before
1122  if (!index_done[b_new] and phase == 0)
1123  {
1124  partial_lcp[b_new] = lcp_value;
1125  index_done[b_new] = true;
1126  // Save interval
1127  dict[target][(a_new << 1) + 1] = 1;
1128  dict[target][(b_new << 1)] = 1;
1129  ++intervals;
1130  }
1131  else if (!index_done[b_new])
1132  {
1133  size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1134  if (!partial_lcp[insert_pos])
1135  {
1136  partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1137  // Save interval
1138  dict[target][(a_new << 1) + 1] = 1;
1139  dict[target][(b_new << 1)] = 1;
1140  ++intervals;
1141  }
1142  }
1143  }
1144  // get next interval
1145  a2 = util::next_bit(dict[source], b2 + 1);
1146  b2 = util::next_bit(dict[source], a2 + 1);
1147  }
1148  std::swap(source, target);
1149  util::set_to_value(dict[target], 0);
1150  }
1151  ++lcp_value;
1152  if (lcp_value >= lcp_value_max)
1153  {
1154  memory_monitor::event("lcp-bwt-write-to-file-begin");
1155  if (phase) { insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset); }
1156  else
1157  {
1158  store_to_file(partial_lcp, lcp_file);
1159  }
1160  memory_monitor::event("lcp-bwt-write-to-file-end");
1161  memory_monitor::event("lcp-bwt-resize-variables-begin");
1162  util::init_support(ds_rank_support, &index_done); // Create rank support
1163 
1164  // Recalculate lcp_value_max and resize partial_lcp
1165  lcp_value_offset = lcp_value_max - 1;
1166  size_type remaining_lcp_values = index_done.size() - ds_rank_support.rank(index_done.size());
1167 
1168  uint8_t int_width_new = std::min(space_in_bit_for_lcp / remaining_lcp_values,
1169  (size_type)bits::hi(n - 1) + 1);
1170  lcp_value_max = lcp_value_offset + (1ULL << int_width_new);
1171 #ifdef STUDY_INFORMATIONS
1172  std::cout << "# l=" << remaining_lcp_values << " b=" << (int)int_width_new
1173  << " lcp_value_max=" << lcp_value_max << std::endl;
1174 #endif
1175  // partial_lcp = int_vector<>(remaining_lcp_values, 0, int_width_new);
1176  partial_lcp.width(int_width_new);
1177  partial_lcp.resize(remaining_lcp_values);
1178  partial_lcp.shrink_to_fit();
1179  util::set_to_value(partial_lcp, 0);
1180  ++phase;
1181  memory_monitor::event("lcp-bwt-resize-variables-end");
1182  }
1183  }
1184  memory_monitor::event("lcp-bwt-calc-values-end");
1185 
1186  // merge to file
1187  memory_monitor::event("lcp-bwt-merge-to-file-begin");
1188  if (phase) { insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset); }
1189  else
1190  {
1191  store_to_file(partial_lcp, lcp_file);
1192  }
1194 
1195  memory_monitor::event("lcp-bwt-merge-to-file-end");
1196 }
1197 
1199 
1214 template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
1216 {
1218 
1219  uint64_t n; // Input length
1220  size_type buffer_size = 1000000; // Size of the buffer
1221  size_type lcp_value = 0; // current LCP value
1222  std::string tmp_lcp_file = cache_file_name(conf::KEY_LCP, config) + "_tmp";
1223  // (1) Calculate LCP-Positions-Array: For each lcp_value (in ascending order) all its occurrences (in any order) in
1224  // the lcp array
1225  {
1226  memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1227  t_wt wt_bwt;
1228  construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
1229  n = wt_bwt.size();
1230  memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1231 
1232  // Declare needed variables
1233  memory_monitor::event("lcp-bwt2-init-begin");
1234  size_type intervals = 0; // Number of intervals which are currently stored
1235  size_type intervals_new = 0; // Number of new intervals
1236 
1237  std::queue<size_type> q; // Queue for storing the intervals
1238  std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
1239  size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
1240  bool queue_used = true; // Defines whether a queue (true) or the bit_vectors (false) was used to store intervals
1241  size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
1242  // else use dictionary and wavelet tree
1243 
1244  size_type quantity; // quantity of characters in interval
1245  std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
1246  std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
1247  std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
1248 
1249  // External storage of LCP-Positions-Array
1250  bool new_lcp_value = false;
1251  uint8_t int_width = bits::hi(n) + 2;
1252  int_vector_buffer<> lcp_positions_buf(tmp_lcp_file,
1253  std::ios::out,
1254  buffer_size,
1255  int_width); // Create buffer for positions of LCP entries
1256  size_type idx_out_buf = 0;
1257  bit_vector index_done(n + 1, 0); // Bitvector which is true, if corresponding LCP value was already calculated
1258 
1259  // Create C-array
1260  std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
1261  create_C_array(C, wt_bwt);
1262  memory_monitor::event("lcp-bwt2-init-end");
1263  // Calculate LCP-Positions-Array
1264  memory_monitor::event("lcp-bwt2-calc-values-begin");
1265 
1266  // Save position of first LCP-value
1267  lcp_positions_buf[idx_out_buf++] = 0;
1268  if (new_lcp_value)
1269  {
1270  lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1271  new_lcp_value = false;
1272  }
1273  index_done[0] = true;
1274 
1275  // calculate first intervals
1276  interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
1277  for (size_type i = 0; i < quantity; ++i)
1278  {
1279  unsigned char c = cs[i];
1280  size_type a_new = C[c] + rank_c_i[i];
1281  size_type b_new = C[c] + rank_c_j[i];
1282 
1283  // Save LCP value and corresponding interval if not seen before
1284  if (!index_done[b_new])
1285  {
1286  if (b_new < n)
1287  {
1288  // Save position of LCP-value
1289  lcp_positions_buf[idx_out_buf++] = b_new;
1290  }
1291  index_done[b_new] = true;
1292 
1293  // Save interval
1294  q.push(a_new);
1295  q.push(b_new);
1296  ++intervals;
1297  }
1298  }
1299  ++lcp_value;
1300  new_lcp_value = true;
1301 
1302  // Calculate LCP positions
1303  while (intervals)
1304  {
1305  if (intervals < use_queue_and_wt && !queue_used)
1306  {
1307  memory_monitor::event("lcp-bwt2-bitvec2queue-begin");
1308  util::clear(dict[target]);
1309 
1310  // Copy from bitvector to queue
1311  size_type a2 = util::next_bit(dict[source], 0);
1312  size_type b2 = util::next_bit(dict[source], a2 + 1);
1313  while (b2 < dict[source].size())
1314  {
1315  q.push((a2 - 1) >> 1);
1316  q.push(b2 >> 1);
1317  // Get next interval
1318  a2 = util::next_bit(dict[source], b2 + 1);
1319  b2 = util::next_bit(dict[source], a2 + 1);
1320  }
1321  util::clear(dict[source]);
1322  memory_monitor::event("lcp-bwt2-bitvec2queue-end");
1323  }
1324  if (intervals >= use_queue_and_wt && queue_used)
1325  {
1326  memory_monitor::event("lcp-bwt2-queue2bitvec-begin");
1327  dict[source].resize(2 * (n + 1));
1328  util::set_to_value(dict[source], 0);
1329  // Copy from queue to bitvector
1330  while (!q.empty())
1331  {
1332  dict[source][(q.front() << 1) + 1] = 1;
1333  q.pop();
1334  dict[source][(q.front() << 1)] = 1;
1335  q.pop();
1336  }
1337  dict[target].resize(2 * (n + 1));
1338  util::set_to_value(dict[target], 0);
1339  memory_monitor::event("lcp-bwt2-queue2bitvec-end");
1340  }
1341 
1342  if (intervals < use_queue_and_wt)
1343  {
1344  queue_used = true;
1345  intervals_new = 0;
1346  while (intervals)
1347  {
1348  // Get next interval
1349  size_type a = q.front();
1350  q.pop();
1351  size_type b = q.front();
1352  q.pop();
1353  --intervals;
1354 
1355  interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1356  for (size_type i = 0; i < quantity; ++i)
1357  {
1358  unsigned char c = cs[i];
1359  size_type a_new = C[c] + rank_c_i[i];
1360  size_type b_new = C[c] + rank_c_j[i];
1361 
1362  // Save LCP value and corresponding interval if not seen before
1363  if (!index_done[b_new])
1364  {
1365  // Save position of LCP-value
1366  lcp_positions_buf[idx_out_buf++] = b_new;
1367  if (new_lcp_value)
1368  {
1369  // Mark new LCP-value
1370  lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1371  new_lcp_value = false;
1372  }
1373  index_done[b_new] = true;
1374 
1375  // Save interval
1376  q.push(a_new);
1377  q.push(b_new);
1378  ++intervals_new;
1379  }
1380  }
1381  }
1382  intervals = intervals_new;
1383  }
1384  else
1385  {
1386  queue_used = false;
1387  intervals = 0;
1388  // get next interval
1389  size_type a2 = util::next_bit(dict[source], 0);
1390  size_type b2 = util::next_bit(dict[source], a2 + 1);
1391 
1392  while (b2 < dict[source].size())
1393  {
1394  interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1395  for (size_type i = 0; i < quantity; ++i)
1396  {
1397  unsigned char c = cs[i];
1398  size_type a_new = C[c] + rank_c_i[i];
1399  size_type b_new = C[c] + rank_c_j[i];
1400  // Save LCP value if not seen before
1401  if (!index_done[b_new])
1402  {
1403  // Save position of LCP-value
1404  lcp_positions_buf[idx_out_buf++] = b_new;
1405  if (new_lcp_value)
1406  {
1407  // Mark new LCP-value
1408  lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1409  new_lcp_value = false;
1410  }
1411  index_done[b_new] = true;
1412  // Save interval
1413  dict[target][(a_new << 1) + 1] = 1;
1414  dict[target][(b_new << 1)] = 1;
1415  ++intervals;
1416  }
1417  }
1418  // get next interval
1419  a2 = util::next_bit(dict[source], b2 + 1);
1420  b2 = util::next_bit(dict[source], a2 + 1);
1421  }
1422  std::swap(source, target);
1423  util::set_to_value(dict[target], 0);
1424  }
1425  ++lcp_value;
1426  new_lcp_value = true;
1427  }
1428  memory_monitor::event("lcp-bwt2-calc-values-end");
1429  lcp_positions_buf.close();
1430  }
1431  // (2) Insert LCP entires into LCP array
1432  {
1433  memory_monitor::event("lcp-bwt2-reordering-begin");
1434 
1435  int_vector_buffer<> lcp_positions(tmp_lcp_file, std::ios::in, buffer_size);
1436 
1437  uint8_t int_width = bits::hi(lcp_value + 1) + 1; // How many bits are needed for one lcp_value?
1438 
1439  // Algorithm does r=ceil(int_width/8) runs over LCP-Positions-Array.
1440  // So in each run k>=(n/r) values of the lcp array must be calculated.
1441  // Because k has to be a multiple of 8, we choose number_of_values = (k+16) - ((k+16)%8)
1442  size_type number_of_values = ((n / ((int_width - 1ULL) / 8 + 1) + 16) & (~(0x7ULL)));
1443  std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
1444  int_vector_buffer<> lcp_array(lcp_file,
1445  std::ios::out,
1446  number_of_values * int_width / 8,
1447  int_width); // Create Output Buffer
1448  number_of_values = lcp_array.buffersize() * 8 / int_width;
1449 
1450  for (size_type position_begin = 0, position_end = number_of_values; position_begin < n and number_of_values > 0;
1451  position_begin = position_end, position_end += number_of_values)
1452  {
1453 #ifdef STUDY_INFORMATIONS
1454  std::cout << "# number_of_values=" << number_of_values << " fill lcp_values with " << position_begin
1455  << " <= position <" << position_end << ", each lcp-value has " << (int)int_width
1456  << " bit, lcp_value_max=" << lcp_value << " n=" << n << std::endl;
1457 #endif
1458  lcp_value = 0;
1459  for (size_type i = 0; i < n; ++i)
1460  {
1461  size_type position = lcp_positions[i];
1462  if (position > n)
1463  {
1464  position -= n;
1465  ++lcp_value;
1466  }
1467  if (position_begin <= position and position < position_end) { lcp_array[position] = lcp_value; }
1468  }
1469  }
1470  // Close file
1471  lcp_array.close();
1473  lcp_positions.close(true); // close buffer and remove file
1474  memory_monitor::event("lcp-bwt2-reordering-end");
1475  } // End of phase 2
1476 }
1477 
1478 } // namespace sdsl
1479 
1480 #endif
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
uint8_t width() const
Returns the width of the integers which are accessed via the [] operator.
void shrink_to_fit()
Free unused allocated memory.
Definition: int_vector.hpp:530
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:619
size_type size() const noexcept
The number of elements in the int_vector.
reference front() noexcept
Returns first element.
Definition: int_vector.hpp:449
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:545
static mm_event_proxy event(const std::string &name)
A rank structure proposed by Sebastiano Vigna.
size_type rank(size_type idx) const
Answers rank queries for the supported bit_vector.
construct_bwt.hpp contains a space and time efficient construction method for the Burrows and Wheeler...
construct_isa.hpp contains a space and time efficient construction method for the inverse suffix arra...
int_vector.hpp contains the sdsl::int_vector class.
constexpr char KEY_SA[]
Definition: config.hpp:37
constexpr char KEY_TEXT[]
Definition: config.hpp:41
constexpr char KEY_LCP[]
Definition: config.hpp:44
constexpr char KEY_ISA[]
Definition: config.hpp:40
constexpr char KEY_BWT[]
Definition: config.hpp:35
int_vector ::size_type size_type
Get the smallest position f $i geq idx f where a bit is set t_int_vec::size_type next_bit(const t_int_vec &v, uint64_t idx)
void set_to_value(t_int_vec &v, uint64_t k)
Set all entries of int_vector to value k.
Definition: util.hpp:566
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
bool load_from_cache(T &v, const std::string &key, const cache_config &config, bool add_type_hash=false)
Definition: io.hpp:711
void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
void interval_symbols(const t_wt &wt, typename t_wt::size_type i, typename t_wt::size_type j, typename t_wt::size_type &k, std::vector< typename t_wt::value_type > &cs, std::vector< typename t_wt::size_type > &rank_c_i, std::vector< typename t_wt::size_type > &rank_c_j)
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
void construct_lcp_kasai(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void construct_lcp_bwt_based(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void create_C_array(std::vector< uint64_t > &C, const tWT &wt)
void insert_lcp_values(int_vector<> &partial_lcp, bit_vector &index_done, std::string lcp_file, uint64_t max_lcp_value, uint64_t lcp_value_offset)
Merges a partial LCP array into the LCP array on disk.
void construct_lcp_PHI(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void swap(int_vector_reference< t_int_vector > x, int_vector_reference< t_int_vector > y) noexcept
Definition: int_vector.hpp:970
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition: construct.hpp:45
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_lcp_goPHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_isa(cache_config &config)
void construct_lcp_bwt_based2(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
Definition: io.hpp:798
bool store_to_cache(const T &v, const std::string &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition: io.hpp:737
void construct_lcp_go(cache_config &config)
Construct the LCP array (only for byte strings)
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
std::list< int_vector<>::size_type > tLI
void push_front_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
T::size_type size_in_bytes(const T &t)
Get the size of a data structure in bytes.
Definition: io.hpp:778
rank_support.hpp contains classes that support a sdsl::bit_vector with constant time rank information...
select_support.hpp contains classes that support a sdsl::bit_vector with constant time select informa...
sfstream.hpp contains a two stream class which can be used to read/write from/to files or strings.
constexpr static uint64_t lo_unset[65]
lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set.
Definition: bits.hpp:220
constexpr static uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
Definition: bits.hpp:197
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661
Helper class for construction process.
Definition: config.hpp:67
Helper classes to transform width=0 and width=8 to corresponding text key.
Definition: config.hpp:91
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.