SDSL  3.0.0
Succinct Data Structure Library
construct_sa_se.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
4 #ifndef SDSL_CONSTRUCT_SA_SE
5 #define SDSL_CONSTRUCT_SA_SE
6 
7 #include <cmath>
8 #include <fstream>
9 #include <iostream>
10 #include <string>
11 
12 #include <sdsl/int_vector.hpp>
13 #include <sdsl/io.hpp>
14 #include <sdsl/rank_support.hpp>
15 #include <sdsl/select_support.hpp>
16 
17 namespace sdsl
18 {
19 
20 template <class int_vector_type>
21 uint64_t _get_next_lms_position(int_vector_type & text, uint64_t i)
22 {
23  if (i >= text.size() - 3) { return text.size() - 1; }
24  // text[i] is S-TYP or L-TYP
25  uint64_t ci = text[i], cip1 = text[i + 1];
26  while (ci <= cip1)
27  {
28  ++i;
29  ci = cip1;
30  cip1 = text[i + 1];
31  }
32  // text[i] is L-TYP or S-TYP
33  uint64_t candidate = i + 1;
34  while (ci >= cip1)
35  {
36  if (ci > cip1)
37  {
38  if (i + 1 == text.size() - 1) { return text.size() - 1; }
39  candidate = i + 1;
40  }
41  ++i;
42  ci = cip1;
43  cip1 = text[i + 1];
44  }
45  return candidate;
46 }
47 
48 inline void _construct_sa_IS(int_vector<> & text,
49  int_vector<> & sa,
50  std::string & filename_sa,
51  size_t n,
52  size_t text_offset,
53  size_t sigma,
54  uint64_t recursion)
55 {
56  uint64_t buffersize = 1024 * 1024 / 8;
57 
58  size_t name = 0;
59  size_t number_of_lms_strings = 0;
60  std::string filename_c_array = tmp_file(filename_sa, "_c_array" + util::to_string(recursion));
61  // Phase 1
62  {
63  std::vector<uint64_t> bkt(sigma, 0);
64  // Step 1 - Count characters into c array
65  // TODO: better create this in higher recursion-level
66  for (size_t i = 0; i < n; ++i) { ++bkt[text[text_offset + i]]; }
67 
68  // Step 1.5 save them into cached_external_array
69  int_vector_buffer<> c_array(filename_c_array, std::ios::out, buffersize, 64);
70  for (size_t c = 0; c < sigma; ++c) { c_array[c] = bkt[c]; }
71 
72  // Step 2 Calculate End-Pointer of Buckets
73  bkt[0] = 0;
74  for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + bkt[c]; }
75 
76  // Step 3 - Insert S*-positions into correct bucket of SA but not in correct order inside the buckets
77  for (size_t i = n - 2, was_s_typ = 1; i < n; --i)
78  {
79  if (text[text_offset + i] > text[text_offset + i + 1])
80  {
81  if (was_s_typ)
82  {
83  sa[bkt[text[text_offset + i + 1]]--] = i + 1;
84  ++number_of_lms_strings;
85  was_s_typ = 0;
86  }
87  }
88  else if (text[text_offset + i] < text[text_offset + i + 1])
89  {
90  was_s_typ = 1;
91  }
92  }
93 
94  // Step 4 - Calculate Begin-Pointer of Buckets
95  bkt[0] = 0;
96  for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c - 1]; }
97 
98  // Step 5 - Scan from Left-To-Right to induce L-Types
99  for (size_t i = 0; i < n; ++i)
100  {
101  if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
102  { // faster than if(sa[i]>0 and bkt_beg[text[ sa[i]-1 ]] > i)
103  sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
104  sa[i] = 0;
105  }
106  }
107 
108  // Step 6 - Scan from Right-To-Left to induce S-Types
109  bkt[0] = 0;
110  for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
111  c_array.close();
112 
113  for (size_t i = n - 1, endpointer = n; i < n; --i)
114  {
115  if (sa[i] > 0)
116  {
117  if (text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
118  { // faster than if(bkt_end[text[ sa[i]-1 ]] < i)
119  sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
120  }
121  else
122  {
123  sa[--endpointer] = sa[i];
124  }
125  sa[i] = 0;
126  }
127  }
128 
129  // Step 7 - Determine length of LMS-Strings
130  for (size_t i = n - 2, end = n - 2, was_s_typ = 1; i < n; --i)
131  {
132  if (text[text_offset + i] > text[text_offset + i + 1])
133  {
134  if (was_s_typ)
135  {
136  sa[(i + 1) >> 1] = end - i;
137  end = i + 1;
138  was_s_typ = 0;
139  }
140  }
141  else if (text[text_offset + i] < text[text_offset + i + 1])
142  {
143  was_s_typ = 1;
144  }
145  }
146 
147  // Step 8 - Rename
148  for (size_t i = n - number_of_lms_strings + 1, cur_pos = 0, cur_len = 0, last_pos = n - 1, last_len = 1; i < n;
149  ++i)
150  {
151  cur_pos = sa[i];
152  cur_len = sa[(cur_pos >> 1)];
153  if (cur_len == last_len)
154  {
155  size_t l = 0;
156  while (l < cur_len and text[text_offset + cur_pos + l] == text[text_offset + last_pos + l]) { ++l; }
157  if (l >= cur_len) { --name; }
158  }
159  sa[(cur_pos >> 1)] = ++name;
160  last_pos = cur_pos;
161  last_len = cur_len;
162  }
163  }
164 
165  // Step 9 - Calculate SA of new string - in most cases recursive
166  if (name + 1 < number_of_lms_strings)
167  {
168  // Move Names to the end
169  for (size_t i = 0, t = n - number_of_lms_strings; i < (n >> 1); ++i)
170  {
171  if (sa[i] > 0)
172  {
173  sa[t++] = sa[i];
174  sa[i] = 0;
175  }
176  }
177  sa[n - 1] = 0;
178 
179  // Recursive call
180  std::string filename_sa_rec = tmp_file(filename_sa, "_sa_rec" + util::to_string(recursion + 1));
181  _construct_sa_IS(sa,
182  sa,
183  filename_sa_rec,
184  number_of_lms_strings,
185  n - number_of_lms_strings,
186  name + 1,
187  recursion + 1);
188 
189  for (size_t i = n - 2, endpointer = n - 1, was_s_typ = 1; i < n; --i)
190  {
191  if (text[text_offset + i] > text[text_offset + i + 1])
192  {
193  if (was_s_typ)
194  {
195  sa[endpointer--] = i + 1;
196  was_s_typ = 0;
197  }
198  }
199  else if (text[text_offset + i] < text[text_offset + i + 1])
200  {
201  was_s_typ = 1;
202  }
203  }
204 
205  // Sort S*-positions in correct order into SA
206  for (size_t i = 0; i < number_of_lms_strings; ++i)
207  {
208  size_t pos = sa[i];
209  sa[i] = sa[n - number_of_lms_strings + pos];
210  sa[n - number_of_lms_strings + pos] = 0;
211  }
212  }
213  else
214  {
215  // Move s*-Positions to front
216  sa[0] = n - 1;
217  for (size_t i = 1; i < number_of_lms_strings; ++i)
218  {
219  sa[i] = sa[n - number_of_lms_strings + i];
220  sa[n - number_of_lms_strings + i] = 0;
221  }
222  // Clear lex. names
223  for (size_t i = number_of_lms_strings; i < (n >> 1); ++i) { sa[i] = 0; }
224  }
225 
226  // Phase 3
227  {
228  // Step 10 - Count characters into c array
229 
230  // Step 11 - Calculate End-Pointer of Buckets
231  int_vector_buffer<> c_array(filename_c_array, std::ios::in, buffersize, 64);
232  std::vector<uint64_t> bkt(sigma, 0);
233  for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
234 
235  // Step 12 - Move S*-positions in correct order into SA
236  for (size_t i = number_of_lms_strings - 1; i < n; --i)
237  {
238  size_t pos = sa[i];
239  sa[i] = 0;
240  sa[bkt[text[text_offset + pos]]--] = pos;
241  }
242 
243  // Step 13 - Calculate Begin-Pointer of Buckets
244  bkt[0] = 0;
245  for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c - 1]; }
246 
247  // Step 14 - Scan from Left-To-Right to induce L-Types
248  for (size_t i = 0; i < n; ++i)
249  {
250  if (sa[i] > 0 and text[text_offset + sa[i]] <= text[text_offset + sa[i] - 1])
251  { // faster than if(sa[i]>0 and bkt_beg[text[ sa[i]-1 ]] > i)
252  sa[bkt[text[text_offset + sa[i] - 1]]++] = sa[i] - 1;
253  }
254  }
255 
256  // Step 15 - Scan from Right-To-Left to induce S-Types
257  bkt[0] = 0;
258  for (size_t c = 1; c < sigma; ++c) { bkt[c] = bkt[c - 1] + c_array[c]; }
259  for (size_t i = n - 1; i < n; --i)
260  {
261  if (sa[i] > 0 and text[text_offset + sa[i] - 1] <= text[text_offset + sa[i]])
262  {
263  sa[bkt[text[text_offset + sa[i] - 1]]--] = sa[i] - 1;
264  }
265  }
266  c_array.close(true);
267  }
268 }
269 
270 template <class int_vector_type>
271 void _construct_sa_se(int_vector_type & text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
272 {
273  std::string filename_text = tmp_file(filename_sa, "_text_rec" + util::to_string(recursion));
274  store_to_file(text, filename_text);
275  uint64_t n = text.size();
276  uint64_t nsize = bits::hi(n) + 1;
277  uint8_t int_width = bits::hi(n - 1) + 1;
278  uint64_t buffersize = 1024 * 1024 / 8;
279 
280  // Step 1 - Scan Text from right to left and count LMS, S and L characters and store lms_positions
281 
282  // Define variables
283  size_t first_lms_pos = 0;
284  size_t number_of_lms_strings = 0;
285  size_t bkt_s_last = 0, bkt_s_sum = 0, bound_s = 0, bkt_l_sum = 0;
286  int_vector<> C(sigma, 0, int_width);
287  int_vector<> bkt_lms(sigma, 0, int_width);
288  int_vector<> bkt_s(sigma, 0, int_width);
289  int_vector<> bkt_l(sigma, 0, int_width);
290  std::string filename_lms_pos_b = tmp_file(filename_sa, "_lms_pos_b" + util::to_string(recursion));
291  size_t parts = 10;
292 
293  {
294  int_vector_buffer<1> lms_pos_b(filename_lms_pos_b, std::ios::out, buffersize, 1);
295  uint64_t ci = text[n - 1];
296  ++C[ci];
297  bool was_s_typ = 1;
298  for (size_t i = n - 2; i < n; --i)
299  {
300  uint64_t cip1 = ci;
301  ci = text[i];
302  ++C[ci];
303  if (was_s_typ)
304  {
305  ++bkt_s[text[i + 1]];
306  if (ci > cip1)
307  {
308  ++bkt_lms[cip1];
309  lms_pos_b[i + 1] = 1;
310  ++number_of_lms_strings;
311  first_lms_pos = i + 1;
312  was_s_typ = 0;
313  }
314  }
315  else if (ci < cip1)
316  {
317  was_s_typ = 1;
318  }
319  }
320  if (was_s_typ) { ++bkt_s[ci]; }
321  bkt_l[0] = C[0] - bkt_s[0];
322  for (size_t i = 1; i < C.size(); ++i)
323  {
324  bkt_l[i] = C[i] - bkt_s[i];
325  C[i] = C[i] + C[i - 1];
326  }
327  lms_pos_b.close();
328  }
329 
330  // Step 2 - Scan Text from right to left and detect LMS-Positions. Sort and write them to disk
331  int_vector_buffer<> right(tmp_file(filename_sa, "_right" + util::to_string(recursion)),
332  std::ios::out,
333  buffersize,
334  nsize);
335  size_t right_pointer = 0;
336  int_vector_buffer<> left(tmp_file(filename_sa, "_left" + util::to_string(recursion)),
337  std::ios::out,
338  buffersize,
339  nsize);
340  size_t left_pointer = 0;
341  {
342  for (size_t i = 0, tmp2 = 0, tmp = 0; i < sigma; ++i)
343  {
344  tmp += bkt_lms[i];
345  bkt_lms[i] = tmp2;
346  tmp2 = tmp;
347  }
348  int_vector_buffer<> lms_positions(tmp_file(filename_sa, "_lms_positions" + util::to_string(recursion)),
349  std::ios::out,
350  buffersize,
351  nsize);
352  for (size_t i = n - 2, was_s_typ = 1, ci = text[n - 1]; i < n; --i)
353  {
354  uint64_t cip1 = ci;
355  ci = text[i];
356  if (ci > cip1)
357  {
358  if (was_s_typ)
359  {
360  lms_positions.push_back(bkt_lms[cip1]);
361  lms_positions.push_back(i + 1);
362  ++bkt_lms[cip1];
363  was_s_typ = 0;
364  }
365  }
366  else if (ci < cip1)
367  {
368  was_s_typ = 1;
369  }
370  }
371  util::clear(text);
372  {
373  // Order lms_positions according to first character
374  int_vector<> lms_strings(number_of_lms_strings, 0, int_width);
375  for (size_t i = 0; i < lms_positions.size();)
376  {
377  size_t idx = lms_positions[i++];
378  size_t val = lms_positions[i++];
379  lms_strings[idx] = val;
380  }
381  lms_positions.close(true);
382  // Store it to file
383  left_pointer = 0;
384  for (size_t i = 0; i < number_of_lms_strings; ++i)
385  {
386  left[left_pointer++] = lms_strings[number_of_lms_strings - i - 1];
387  }
388  }
389  load_from_file(text, filename_text);
390  }
391  left_pointer--;
392 
393  // Step 3 - Scan virtual array from left to right
394  {
395  // prepare bkt_l and backup it into bkt_lms
396  for (size_t i = 0, tmp = 0; i < sigma; ++i)
397  {
398  tmp = bkt_l[i];
399  bkt_l[i] = bkt_l_sum;
400  bkt_l_sum += tmp;
401  bkt_lms[i] = bkt_l[i];
402  }
403 
404  size_t partsize = bkt_l_sum / parts + 1;
405 
406  int_vector<> array(partsize, 0, int_width);
407  std::vector<int_vector_buffer<>> cached_array(parts - 1);
408  for (size_t i = 0; i < cached_array.size(); ++i)
409  {
410  cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
411  "_rightbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
412  std::ios::out,
413  buffersize,
414  nsize);
415  }
416 
417  for (size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
418  {
419  // begin with array
420  for (; pos < bkt_l[c]; ++pos)
421  {
422  // Load lazy values
423  if (pos - offset >= partsize)
424  {
425  offset += partsize;
426  for (size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
427  {
428  size_t src = cached_array[cur_part][i++];
429  size_t val = cached_array[cur_part][i++];
430  array[src - offset] = val;
431  }
432  cached_array[pos / partsize - 1].reset();
433  }
434 
435  size_t idx = array[pos - offset];
436  if (idx == 0) { right[right_pointer++] = idx; }
437  else
438  {
439  size_t symbol = text[idx - 1];
440  if (symbol >= c)
441  {
442  size_t val = idx - 1;
443  size_t src = bkt_l[symbol];
444  bkt_l[symbol] = bkt_l[symbol] + 1;
445  if ((src - offset) / partsize == 0) { array[src - offset] = val; }
446  else
447  {
448  size_t part = src / partsize - 1;
449  cached_array[part].push_back(src);
450  cached_array[part].push_back(val);
451  }
452  }
453  else
454  {
455  right[right_pointer++] = idx;
456  }
457  }
458  }
459  // continue with stack
460  while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
461  {
462  size_t idx = left[left_pointer--];
463  --idx;
464  size_t symbol = text[idx];
465 
466  size_t val = idx;
467  size_t src = bkt_l[symbol];
468  bkt_l[symbol] = bkt_l[symbol] + 1;
469  if ((src - offset) / partsize == 0) { array[src - offset] = val; }
470  else
471  {
472  size_t part = src / partsize - 1;
473  cached_array[part].push_back(src);
474  cached_array[part].push_back(val);
475  }
476  }
477  }
478  for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
479 
480  // Restore bkt_l from bkt_lms
481  for (size_t i = 0; i < sigma; ++i) { bkt_l[i] = bkt_lms[i]; }
482  }
483  right_pointer--;
484 
485  // Step 4 - Scan virtual array from right to left
486  left_pointer = 0;
487  left.reset();
488  {
489  // Prepare bkt_s and backup it into bkt_lms
490  bkt_s_last = 0, bkt_s_sum = 0;
491  for (size_t i = 0; i < sigma; ++i)
492  {
493  bkt_s_sum += bkt_s[i];
494  if (bkt_s[i])
495  {
496  bkt_s[i] = bkt_s_sum;
497  bkt_s_last = bkt_s_sum;
498  }
499  else
500  {
501  bkt_s[i] = bkt_s_sum;
502  }
503  bkt_lms[i] = bkt_s[i];
504  }
505  bound_s = bkt_s_sum;
506 
507  // Determine splitting parameters
508  for (size_t i = 0; i < sigma; ++i)
509  {
510  if (bkt_s[i] > bkt_s_sum / 2)
511  {
512  bkt_s_sum = bkt_s[i];
513  break;
514  }
515  }
516 
517  size_t partsize = bound_s / parts + 1;
518  int_vector<> array(partsize, 0, int_width);
519  std::vector<int_vector_buffer<>> cached_array(parts - 1);
520  for (size_t i = 0; i < cached_array.size(); ++i)
521  {
522  cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
523  "_leftbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
524  std::ios::out,
525  buffersize,
526  nsize);
527  }
528  for (size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
529  {
530  // begin with array
531  for (; pos + 1 > bkt_s[c]; --pos)
532  {
533  while (pos < offset)
534  {
535  // Load buffered values
536  offset -= partsize;
537  for (size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
538  {
539  size_t src = cached_array[cur_part][i++];
540  size_t val = cached_array[cur_part][i++];
541  array[src - offset] = val;
542  }
543  cached_array[offset / partsize].reset();
544  }
545 
546  size_t idx = array[pos - offset];
547  if (idx == 0) { idx = n; }
548  --idx;
549  size_t symbol = text[idx];
550  if (symbol <= c)
551  {
552  bkt_s[symbol] = bkt_s[symbol] - 1;
553  size_t val = idx;
554  size_t src = bkt_s[symbol];
555  if (src >= offset) { array[src - offset] = val; }
556  else
557  {
558  size_t part = src / partsize;
559  cached_array[part].push_back(src);
560  cached_array[part].push_back(val);
561  }
562  }
563  else
564  {
565  left[left_pointer++] = array[pos - offset];
566  }
567  }
568 
569  // continue with stack
570  while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
571  {
572  size_t idx = right[right_pointer--];
573  if (idx == 0) { idx = n; }
574  --idx;
575  size_t symbol = text[idx];
576  bkt_s[symbol] = bkt_s[symbol] - 1;
577 
578  size_t val = idx;
579  size_t src = bkt_s[symbol];
580  if (src >= offset) { array[src - offset] = val; }
581  else
582  {
583  size_t part = src / partsize;
584  cached_array[part].push_back(src);
585  cached_array[part].push_back(val);
586  }
587  }
588  }
589  for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
590  // Restore bkt_s from bkt_lms
591  for (size_t i = 0; i < sigma; ++i) { bkt_s[i] = bkt_lms[i]; }
592  }
593  right.buffersize(0);
594  right.reset();
595  right_pointer = 0;
596  --left_pointer;
597 
598  // Step 5 - Detect same lms-Strings, write text to file
599  int_vector<1> same_lms(number_of_lms_strings, false);
600  size_t last_end_pos = first_lms_pos, order = number_of_lms_strings - 1;
601  same_lms[number_of_lms_strings - 1] = true;
602  for (size_t i = number_of_lms_strings - 2, a = 0, b = 0, last_a = left[number_of_lms_strings - 1];
603  i < number_of_lms_strings;
604  --i)
605  {
606  b = last_a;
607  a = left[i];
608  last_a = a;
609 
610  size_t end_pos = _get_next_lms_position(text, a);
611  if (end_pos - a == last_end_pos - b)
612  {
613  while (a < end_pos and text[a] == text[b])
614  {
615  ++a;
616  ++b;
617  }
618  if (text[a] == text[b])
619  {
620  same_lms[i] = true;
621  --order;
622  }
623  }
624  last_end_pos = end_pos;
625  }
626  util::clear(text);
627 
628  // Step 7 - Create renamed string
629  int_vector<> text_rec;
630  if (recursion == 0) { text_rec.width((bits::hi(order + 1) + 1)); }
631  else
632  {
633  text_rec.width((bits::hi(number_of_lms_strings + 1) + 1));
634  }
635  text_rec.resize(number_of_lms_strings);
636  util::_set_zero_bits(text_rec);
637  {
638  if (recursion == 0 and n / 2 * text_rec.width() > 8 * n)
639  {
640  size_t size_of_part = n / 4 + 3;
641  text_rec.resize(size_of_part);
642  util::_set_zero_bits(text_rec);
643  order = 0;
644  for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
645  {
646  if (!same_lms[i]) { ++order; }
647  if (left[i] / 2 >= size_of_part) { text_rec[(left[i] / 2) - size_of_part] = order; }
648  }
649  std::string filename_text_rec_part2 = tmp_file(filename_sa, "_text_rec_part2" + util::to_string(recursion));
650  size_t pos = 0;
651  for (size_t i = 0; i < size_of_part; ++i)
652  {
653  if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
654  }
655  text_rec.resize(pos);
656  store_to_file(text_rec, filename_text_rec_part2);
657  text_rec.resize(size_of_part);
658  util::_set_zero_bits(text_rec);
659  order = 0;
660  for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
661  {
662  if (!same_lms[i]) { ++order; }
663  if (left[i] / 2 < size_of_part) { text_rec[left[i] / 2] = order; }
664  }
665  pos = 0;
666  for (size_t i = 0; i < size_of_part; ++i)
667  {
668  if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
669  }
670  text_rec.resize(number_of_lms_strings);
671  int_vector_buffer<> buf(filename_text_rec_part2, std::ios::in, 1024 * 1024);
672  for (size_t i = 0; i < buf.size(); ++i) { text_rec[pos++] = buf[i]; }
673  buf.close(true);
674  text_rec[number_of_lms_strings - 1] = 0;
675  }
676  else
677  {
678  text_rec.resize(n / 2 + 1);
679  util::_set_zero_bits(text_rec);
680  order = 0;
681  for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
682  {
683  if (!same_lms[i]) { ++order; }
684  text_rec[left[left_pointer--] / 2] = order;
685  }
686  for (size_t i = 0, pos = 0; i < text_rec.size(); ++i)
687  {
688  if (text_rec[i] > 0) { text_rec[pos++] = text_rec[i]; }
689  }
690  text_rec[number_of_lms_strings - 1] = 0;
691  text_rec.resize(number_of_lms_strings);
692  }
693  }
694  util::clear(same_lms);
695  left.buffersize(0);
696  left.reset();
697 
698  // Step 8 - Determine complete LMS order (recursivly)
699  int_vector<> isa_rec;
700  std::string filename_sa_rec = tmp_file(filename_sa, "_sa_rec" + util::to_string(recursion + 1));
701  if (text_rec.size() > order + 1)
702  {
703  if (recursion == 0)
704  {
705  memory_monitor::event("begin _construct_sa");
706  _construct_sa_se<int_vector<>>(text_rec, filename_sa_rec, order + 1, recursion + 1);
707  memory_monitor::event("end _construct_sa");
708  }
709  else
710  {
711  text_rec.resize(text_rec.size() * 2);
712  for (size_t i = 0; i < number_of_lms_strings; ++i)
713  {
714  text_rec[number_of_lms_strings + i] = text_rec[i];
715  text_rec[i] = 0;
716  }
717  memory_monitor::event("begin sa_simple");
718  _construct_sa_IS(text_rec,
719  text_rec,
720  filename_sa_rec,
721  number_of_lms_strings,
722  number_of_lms_strings,
723  order + 1,
724  recursion + 1);
725  memory_monitor::event("end sa_simple");
726  // SA' in first half, S' in second half
727  text_rec.resize(number_of_lms_strings);
728  store_to_file(text_rec, filename_sa_rec);
729  }
730  }
731  else
732  {
733  isa_rec = std::move(text_rec);
734  }
735 
736  // Step 9 - Initiate left for scan in step 12
737  if (isa_rec.size() > 0)
738  {
739  // isa_rec exists //TODO test if its better to create sa_rec
740  // TODO always enough memory? in memory: isa_rec, lms_pos_b, select_support, tmp_left, leftbuffer
741  // load bit_vector lms_positions and create select support
742  bit_vector lms_pos_b(n);
743  load_from_file(lms_pos_b, filename_lms_pos_b);
744  sdsl::remove(filename_lms_pos_b);
745  select_support_mcl<> lms_select_support; // select_support for bit_vector
746  util::init_support(lms_select_support, &lms_pos_b); // Create select_support
747  // write left
748  int_vector<> tmp_left(number_of_lms_strings, 0, int_width);
749  for (size_t i = number_of_lms_strings - 1; i < number_of_lms_strings; --i)
750  {
751  size_t idx = isa_rec[i];
752  size_t val = lms_select_support.select(i + 1); // TODO test alternative without select support: look for 1
753  // in lms_pos_b (backwards)
754  tmp_left[idx] = val;
755  }
756  util::clear(lms_select_support);
757  util::clear(lms_pos_b);
758  util::clear(isa_rec);
759  // write to left
760  left.buffersize(buffersize);
761  left_pointer = 0;
762  for (; left_pointer < number_of_lms_strings; ++left_pointer)
763  {
764  left[left_pointer] = tmp_left[number_of_lms_strings - left_pointer - 1];
765  }
766  left_pointer--;
767  util::clear(tmp_left);
768  }
769  else
770  {
771  left.buffersize(buffersize);
772  left_pointer = 0;
773  {
774  // load bit_vector lms_positions and create select support
775  bit_vector lms_pos_b(n);
776  load_from_file(lms_pos_b, filename_lms_pos_b);
777  sdsl::remove(filename_lms_pos_b);
778  select_support_mcl<> lms_select_support; // select_support for bit_vector
779  util::init_support(lms_select_support, &lms_pos_b); // create select_support
780  // write to left sa_rec buffered
781  int_vector_buffer<> sa_rec_buf(filename_sa_rec, std::ios::in, buffersize, nsize);
782  for (uint64_t i = 0; i < sa_rec_buf.size(); ++i)
783  {
784  uint64_t pos = lms_select_support.select(sa_rec_buf[i] + 1);
785  left[number_of_lms_strings - 1 - left_pointer++] = pos;
786  }
787  sa_rec_buf.close(true);
788  left_pointer--;
789  }
790  // TODO test sa_rec unbuffered in recursion level 1 -> space still good?
791  }
792 
793  // Step 10 - Reload text
794  load_from_file(text, filename_text);
795  sdsl::remove(filename_text);
796 
797  // Step 12 - Scan virtual array from left to right second time
798  right.buffersize(buffersize);
799  right_pointer = 0;
800  int_vector_buffer<> cached_sa(filename_sa, std::ios::out, buffersize, nsize);
801  size_t sa_pointer = 0;
802  {
803  size_t partsize = bkt_l_sum / parts + 1;
804  int_vector<> array(partsize, 0, int_width);
805  std::vector<int_vector_buffer<>> cached_array(parts - 1);
806  for (size_t i = 0; i < cached_array.size(); ++i)
807  {
808  cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
809  "_rightbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
810  std::ios::out,
811  buffersize,
812  nsize);
813  }
814 
815  for (size_t c = 0, pos = 0, offset = 0; c < sigma; ++c)
816  {
817  // begin with array
818  for (; pos < bkt_l[c]; ++pos)
819  {
820  // Load lazy values
821  if (pos - offset >= partsize)
822  {
823  offset += partsize;
824  for (size_t i = 0, cur_part = pos / partsize - 1; i < cached_array[cur_part].size();)
825  {
826  size_t src = cached_array[cur_part][i++];
827  size_t val = cached_array[cur_part][i++];
828  array[src - offset] = val;
829  }
830  cached_array[pos / partsize - 1].reset();
831  }
832  size_t idx = array[pos - offset];
833  if (idx == 0)
834  {
835  cached_sa[sa_pointer++] = idx;
836  right[right_pointer++] = idx;
837  }
838  else
839  {
840  size_t symbol = text[idx - 1];
841  cached_sa[sa_pointer++] = idx;
842  if (symbol >= c)
843  {
844  size_t val = idx - 1;
845  size_t src = bkt_l[symbol];
846  bkt_l[symbol] = bkt_l[symbol] + 1;
847  if ((src - offset) / partsize == 0) { array[src - offset] = val; }
848  else
849  {
850  size_t part = src / partsize - 1;
851  cached_array[part].push_back(src);
852  cached_array[part].push_back(val);
853  }
854  }
855  else
856  {
857  right[right_pointer++] = idx;
858  }
859  }
860  }
861  sa_pointer = C[c];
862  // continue with stack
863  while (left_pointer < number_of_lms_strings and text[left[left_pointer]] == c)
864  {
865  size_t idx = left[left_pointer--];
866  if (idx == 0) { idx = n; }
867  --idx;
868  size_t symbol = text[idx];
869  size_t val = idx;
870  size_t src = bkt_l[symbol];
871  bkt_l[symbol] = bkt_l[symbol] + 1;
872  if ((src - offset) / partsize == 0) { array[src - offset] = val; }
873  else
874  {
875  size_t part = src / partsize - 1;
876  cached_array[part].push_back(src);
877  cached_array[part].push_back(val);
878  }
879  }
880  }
881  for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
882  }
883  left.close(true);
884  right_pointer--;
885 
886  // Step 13 - Scan virtual array from right to left second time
887  {
888  size_t partsize = bound_s / parts + 1;
889 
890  int_vector<> array(partsize, 0, int_width);
891  std::vector<int_vector_buffer<>> cached_array(parts - 1);
892  for (size_t i = 0; i < cached_array.size(); ++i)
893  {
894  cached_array[i] = int_vector_buffer<>(tmp_file(filename_sa,
895  "_leftbuffer" + util::to_string(i) + "_" + util::to_string(recursion)),
896  std::ios::out,
897  buffersize,
898  nsize);
899  // cached_array_pointer[i] = 0;
900  }
901  for (size_t c = sigma - 1, pos = bkt_s_last - 1, offset = partsize * (parts - 1); c < sigma; --c)
902  {
903  // begin with array
904  assert(c < C.size());
905  sa_pointer = C[c] - 1;
906  for (; pos + 1 > bkt_s[c]; --pos)
907  {
908  while (pos < offset)
909  {
910  // Load buffered values
911  offset -= partsize;
912  for (size_t i = 0, cur_part = offset / partsize; i < cached_array[cur_part].size();)
913  {
914  size_t src = cached_array[cur_part][i++];
915  size_t val = cached_array[cur_part][i++];
916  assert((src - offset) < array.size());
917  array[src - offset] = val;
918  }
919  assert((offset / partsize) < parts - 1);
920  cached_array[offset / partsize].reset();
921  }
922 
923  assert((pos - offset) < array.size());
924  size_t idx = array[pos - offset];
925  if (idx == 0) { idx = n; }
926  --idx;
927  assert((idx) < text.size());
928  size_t symbol = text[idx];
929  if (symbol <= c)
930  {
931  if (idx == n - 1) { cached_sa[sa_pointer--] = 0; }
932  else
933  {
934  cached_sa[sa_pointer--] = idx + 1;
935  }
936  assert((symbol) < bkt_s.size());
937  bkt_s[symbol] = bkt_s[symbol] - 1;
938  size_t val = idx;
939  size_t src = bkt_s[symbol];
940  if (src >= offset)
941  {
942  assert((src - offset) < array.size());
943  array[src - offset] = val;
944  }
945  else
946  {
947  size_t part = src / partsize;
948  assert(part < parts - 1);
949  cached_array[part].push_back(src);
950  cached_array[part].push_back(val);
951  }
952  }
953  else
954  {
955  if (idx == n - 1) { cached_sa[sa_pointer--] = 0; }
956  else
957  {
958  cached_sa[sa_pointer--] = idx + 1;
959  }
960  }
961  }
962  // continue with stack
963  while (right_pointer < number_of_lms_strings and text[right[right_pointer]] == c)
964  {
965  size_t idx = right[right_pointer--];
966  if (idx == 0) { idx = n; }
967  --idx;
968  size_t symbol = text[idx];
969  assert((symbol) < bkt_s.size());
970  bkt_s[symbol] = bkt_s[symbol] - 1;
971 
972  size_t val = idx;
973  size_t src = bkt_s[symbol];
974  if (src >= offset)
975  {
976  assert((src - offset) < array.size());
977  array[src - offset] = val;
978  }
979  else
980  {
981  size_t part = src / partsize;
982  assert((part) < parts - 1);
983  cached_array[part].push_back(src);
984  cached_array[part].push_back(val);
985  }
986  }
987  }
988  for (size_t i = 0; i < cached_array.size(); ++i) { cached_array[i].close(true); }
989  }
990  right.close(true);
991  cached_sa.close();
992 
993  return;
994 }
995 
996 } // namespace sdsl
997 
998 #endif
uint64_t size() const
Returns the number of elements currently stored.
void reset()
Delete all content and set size to 0.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:619
size_type size() const noexcept
The number of elements in the int_vector.
void push_back(value_type value)
Insert element at the end.
Definition: int_vector.hpp:472
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:545
static mm_event_proxy event(const std::string &name)
A class supporting constant time select queries.
size_type select(size_type i) const
Select function.
int_vector.hpp contains the sdsl::int_vector class.
io.hpp contains some methods for reading/writing sdsl structures.
std::string to_string(const T &t, int w=1)
void _set_zero_bits(t_int_vec &v)
Sets all bits of the int_vector to 0-bits.
Definition: util.hpp:534
Namespace for the succinct data structure library.
void _construct_sa_IS(int_vector<> &text, int_vector<> &sa, std::string &filename_sa, size_t n, size_t text_offset, size_t sigma, uint64_t recursion)
uint64_t _get_next_lms_position(int_vector_type &text, uint64_t i)
void _construct_sa_se(int_vector_type &text, std::string filename_sa, uint64_t sigma, uint64_t recursion)
std::string tmp_file(const cache_config &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition: io.hpp:698
bool load_from_file(T &v, const std::string &file)
Load sdsl-object v from a file.
Definition: io.hpp:901
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
Definition: io.hpp:798
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
rank_support.hpp contains classes that support a sdsl::bit_vector with constant time rank information...
select_support.hpp contains classes that support a sdsl::bit_vector with constant time select informa...
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661