SDSL  3.0.0
Succinct Data Structure Library
qsufsort.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
4 /* qsufsort.c
5  * Copyright 1999, N. Jesper Larsson, all rights reserved.
6  *
7  * This file contains an implementation of the algorithm presented in "Faster
8  * Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko
9  * Sadakane (sada@is.s.u-tokyo.ac.jp).
10  *
11  * This software may be used freely for any purpose. However, when distributed,
12  * the original source must be clearly stated, and, when the source code is
13  * distributed, the copyright notice must be retained and any alterations in
14  * the code must be clearly marked. No warranty is given regarding the quality
15  of this software.*/
23 #ifndef INCLUDED_SDSL_QSUFSORT
24 #define INCLUDED_SDSL_QSUFSORT
25 
26 #define DBG_OUT \
27  if (0) std::cout
28 
29 #include <sdsl/int_vector.hpp>
30 
31 namespace sdsl
32 {
33 namespace qsufsort
34 {
35 
36 template <class int_vector_type = int_vector<>>
37 class sorter;
38 
39 // void sort(int_iter text, int_iter sa, int64_t n, int64_t k, int64_t l);
40 
42 
58 // TODO: problem when int_width==64!!!
59 template <class int_vector_type>
60 void construct_sa(int_vector_type & sa, const char * file, uint8_t num_bytes)
61 {
63  s.sort(sa, file, num_bytes);
64 }
65 
66 template <class int_vector_type, class t_vec>
67 void construct_sa(int_vector_type & sa, t_vec & text)
68 {
70  s.sort(sa, text);
71 }
72 
73 template <class int_vector_type>
74 class sorter
75 {
76  typedef int_vector_type tIV;
77  typedef typename tIV::iterator int_iter;
78  typedef typename tIV::size_type size_type;
79 
80  private:
81  int_iter m_SA, // group array, ultimately suffix array.
82  m_VV; // inverse array, ultimately inverse of SA.
83  uint64_t m_rr, // number of symbols aggregated by transform.
84  m_hh; // length of already-sorted prefixes.
85  uint8_t m_msb; // most significant bit position starting from 0
86  uint64_t m_msb_mask; // mask for 1ULL<<msb
87 
88  inline int64_t to_sign(uint64_t x) const { return x & m_msb_mask ? -((int64_t)(x & ~m_msb_mask)) : x; }
89  // return the absolute value of integer x
90  inline int64_t mark_pos(uint64_t x) const { return (x & ~m_msb_mask); }
91  // mark the number x as negative
92  inline int64_t mark_neg(uint64_t x) const { return x | m_msb_mask; }
93  // check if x is not negative
94  inline bool not_neg(uint64_t x) const { return !(x >> m_msb); }
95  // check if x is negative
96  inline bool is_neg(uint64_t x) const { return x & m_msb_mask; }
97  // returns the key of iterator p at the current sorting depth
98  inline uint64_t key(const int_iter & p) const { return m_VV[*p + m_hh]; }
99  // swap the value of two iterators
100  inline void swap(int_iter & p, int_iter & q) const
101  {
102  uint64_t tmp = *p;
103  *p = *q;
104  *q = tmp;
105  }
106  // select the median out of 3 elements
107  inline const int_iter & med3(const int_iter & a, const int_iter & b, const int_iter & c) const
108  {
109  return key(a) < key(b) ? (key(b) < key(c) ? b : (key(a) < key(c) ? c : a))
110  : (key(b) > key(c) ? b : (key(a) > key(c) ? c : a));
111  }
112 
113  /* Subroutine for select_sort_split and sort_split. Sets group numbers for a
114  group whose lowest position in m_SA is pl and highest position is pm.*/
115  void update_group(int_iter pl, int_iter pm)
116  {
117  int64_t g = pm - m_SA; /* group number.*/
118  m_VV[*pl] = g; /* update group number of first position.*/
119  if (pl == pm)
120  *pl = mark_neg(1); /* one element, sorted group.*/
121  else
122  do /* more than one element, unsorted group.*/
123  m_VV[*++pl] = g; /* update group numbers.*/
124  while (pl < pm);
125  }
126 
127  /* Quadratic sorting method to use for small subarrays. To be able to update
128  group numbers consistently, a variant of selection sorting is used.*/
129  void select_sort_split(const int_iter & p, int64_t n)
130  {
131  int_iter pa, pb, pi, pn;
132  uint64_t f, v;
133 
134  pa = p; /* pa is start of group being picked out.*/
135  pn = p + n - 1; /* pn is last position of subarray.*/
136  while (pa < pn)
137  {
138  for (pi = pb = (pa + 1), f = key(pa); pi <= pn; ++pi)
139  if ((v = key(pi)) < f)
140  {
141  f = v; /* f is smallest key found.*/
142  swap(pi, pa); /* place smallest element at beginning.*/
143  pb = pa + 1; /* pb is position for elements equal to f.*/
144  }
145  else if (v == f)
146  { /* if equal to smallest key.*/
147  swap(pi, pb); /* place next to other smallest elements.*/
148  ++pb;
149  }
150  update_group(pa, pb - 1); /* update group values for new group.*/
151  pa = pb; /* continue sorting rest of the subarray.*/
152  }
153  if (pa == pn)
154  { /* check if last part is single element.*/
155  m_VV[*pa] = pa - m_SA;
156  *pa = mark_neg(1); /* sorted group.*/
157  }
158  }
159 
160  /* Subroutine for sort_split, algorithm by Bentley & McIlroy.*/
161  uint64_t choose_pivot(const int_iter & p, int64_t n)
162  {
163  int_iter pl, pm, pn;
164  int64_t s;
165 
166  pm = p + (n >> 1); /* small arrays, middle element.*/
167  if (n > 7LL)
168  {
169  pl = p;
170  pn = p + n - 1;
171  if (n > 40LL)
172  { /* big arrays, pseudomedian of 9.*/
173  s = n >> 3;
174  pl = med3(pl, pl + s, pl + s + s);
175  pm = med3(pm - s, pm, pm + s);
176  pn = med3(pn - s - s, pn - s, pn);
177  }
178  pm = med3(pl, pm, pn); /* midsize arrays, median of 3.*/
179  }
180  return key(pm);
181  }
182 
183  /* Sorting routine called for each unsorted group. Sorts the array of integers
184  * (suffix numbers) of length n starting at p. The algorithm is a ternary-split
185  * quicksort taken from Bentley & McIlroy, "Engineering a Sort Function",
186  * Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This
187  function is based on Program 7.*/
188  void sort_split(const int_iter & p, int64_t n)
189  {
190  int_iter pa, pb, pc, pd, pl, pm, pn;
191  uint64_t f, v;
192  int64_t s, t;
193 
194  if (n < 7)
195  { /* multi-selection sort smallest arrays.*/
196  select_sort_split(p, n);
197  return;
198  }
199 
200  v = choose_pivot(p, n);
201  pa = pb = p; // pa: iterator for equal elements
202  pc = pd = p + n - 1; // pc = right border (inclusive)
203  while (1)
204  { /* split-end partition.*/
205  while (pb <= pc && (f = key(pb)) <= v)
206  { // go to the right as long as there are keys smaller equal than v
207  if (f == v)
208  {
209  swap(pa, pb);
210  ++pa; // swap equal chars to the left
211  }
212  ++pb;
213  }
214  while (pc >= pb && (f = key(pc)) >= v)
215  { // go to the left as long as there are keys bigger or equal to v
216  if (f == v)
217  {
218  swap(pc, pd);
219  --pd; // swap equal chars to the right end
220  }
221  --pc;
222  }
223  if (pb > pc) break;
224  swap(pb,
225  pc); // swap element > v (pb) to the third part and element < v (pc) to the second
226  ++pb;
227  --pc;
228  }
229  pn = p + n;
230  if ((s = pa - p) > (t = pb - pa)) s = t;
231  for (pl = p, pm = pb - s; s; --s, ++pl, ++pm) swap(pl, pm);
232  if ((s = pd - pc) > (t = pn - pd - 1)) s = t;
233  for (pl = pb, pm = pn - s; s; --s, ++pl, ++pm) swap(pl, pm);
234  s = pb - pa;
235  t = pd - pc;
236  if (pa > pb)
237  {
238  if (s > 0) { std::cout << "s=" << s << ">0 but should be <0; n=" << n << std::endl; }
239  }
240  if (pc > pd)
241  {
242  if (t > 0) { std::cout << "t=" << t << ">0 but should be <0; n=" << n << std::endl; }
243  }
244  if (s > 0) sort_split(p, s);
245  update_group(p + s, p + n - t - 1);
246  if (t > 0) sort_split(p + n - t, t);
247  }
248 
249  /* Bucketsort for first iteration.
250  *
251  * Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear
252  * at least once. x[n] is 0. (This is the corresponding output of transform.) k
253  * must be at most n+1. p is array of size n+1 whose contents are disregarded.
254  *
255  * Output: x is m_VV and p is m_SA after the initial sorting stage of the refined
256  suffix sorting algorithm.*/
257  void bucketsort(const int_iter & x, const int_iter & p, int64_t n, int64_t k)
258  {
259  int_iter pi;
260  int64_t i, d, g;
261  uint64_t c;
262 
263  for (pi = p; pi < p + k; ++pi) *pi = mark_neg(1); /* mark linked lists empty.*/
264  for (i = 0; i <= n; ++i)
265  {
266  x[i] = p[c = x[i]]; /* insert in linked list.*/
267  p[c] = i;
268  }
269  for (pi = p + k - 1, i = n; pi >= p; --pi)
270  {
271  d = x[c = *pi]; /* c is position, d is next in list.*/
272  x[c] = g = i; /* last position equals group number.*/
273  if (not_neg(d))
274  { /* if more than one element in group.*/
275  p[i--] = c; /* p is permutation for the sorted x.*/
276  do {
277  d = x[c = d]; /* next in linked list.*/
278  x[c] = g; /* group number in x.*/
279  p[i--] = c; /* permutation in p.*/
280  } while (not_neg(d));
281  }
282  else
283  p[i--] = mark_neg(1); /* one element, sorted group.*/
284  }
285  }
286 
287  public:
288  /* Transforms the alphabet of x by attempting to aggregate several symbols into
289  * one, while preserving the suffix order of x. The alphabet may also be
290  * compacted, so that x on output comprises all integers of the new alphabet
291  * with no skipped numbers.
292  *
293  * Input: x is an array of size n+1 whose first n elements are positive
294  * integers in the range l...k-1. p is array of size n+1, used for temporary
295  * storage. q controls aggregation and compaction by defining the maximum value
296  * for any symbol during transformation: q must be at least k-l; if q<=n,
297  * compaction is guaranteed; if k-l>n, compaction is never done; if q is
298  * INT_MAX, the maximum number of symbols are aggregated into one.
299  *
300  * Output: Returns an integer j in the range 1...q representing the size of the
301  * new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is
302  set to the number of old symbols grouped into one. Only x[n] is 0.*/
303 
304  int64_t transform(const int_iter & x, const int_iter & p, int64_t n, int64_t k, int64_t l, int64_t q)
305  {
306  if (!(q >= k - l)) { std::cout << "q=" << q << " k-l=" << k - l << std::endl; }
307  assert(q >= k - l);
308  DBG_OUT << "transform(n=" << n << ", k=" << k << ", l=" << l << ", q=" << q << ")" << std::endl;
309  uint64_t bb, cc, dd;
310  int64_t jj;
311  int_iter pi, pj;
312  int s = bits::hi(k - l) + (k > l); /* s is number of bits in old symbol.*/
313  uint8_t len = 0; /* len is for overflow checking.*/
314  m_rr = 0;
315  for (bb = dd = 0; (int)m_rr < n && (int)len < m_msb + 1 - s && (int64_t)(cc = dd << s | (k - l)) <= q;
316  ++m_rr, len += s)
317  {
318  bb = bb << s | (x[m_rr] - l + 1); /* bb is start of x in chunk alphabet.*/
319  dd = cc; /* dd is max symbol in chunk alphabet.*/
320  }
321  DBG_OUT << "m_rr=" << m_rr << std::endl;
322  uint64_t mm = (1ULL << (m_rr - 1) * s) - 1; /* mm masks off top old symbol from chunk.*/
323  x[n] = l - 1; /* emulate zero terminator.*/
324  if ((int64_t)dd <= n)
325  { /* if bucketing possible, compact alphabet.*/
326  for (pi = p; pi <= p + dd; ++pi) *pi = 0; /* zero transformation table.*/
327  for (pi = x + m_rr, cc = bb; pi <= x + n; ++pi)
328  {
329  p[cc] = 1; /* mark used chunk symbol.*/
330  cc = (cc & mm) << s | (*pi - l + 1); /* shift in next old symbol in chunk.*/
331  }
332  for (uint64_t i = 1; i < m_rr; ++i)
333  { /* handle last r-1 positions.*/
334  p[cc] = 1; /* mark used chunk symbol.*/
335  cc = (cc & mm) << s; /* shift in next old symbol in chunk.*/
336  }
337  for (pi = p, jj = 1; pi <= p + dd; ++pi)
338  if (*pi) *pi = jj++; /* j is new alphabet size.*/
339  for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
340  {
341  *pi = p[cc]; /* transform to new alphabet.*/
342  cc = (cc & mm) << s | (*pj - l + 1); /* shift in next old symbol in chunk.*/
343  }
344  while (pi < x + n)
345  { /* handle last r-1 positions.*/
346  *pi++ = p[cc]; /* transform to new alphabet.*/
347  cc = (cc & mm) << s; /* shift right-end zero in chunk.*/
348  }
349  }
350  else
351  { /* bucketing not possible, don't compact.*/
352  for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
353  {
354  *pi = cc; /* transform to new alphabet.*/
355  cc = (cc & mm) << s | (*pj - l + 1); /* shift in next old symbol in chunk.*/
356  }
357  while (pi < x + n)
358  { /* handle last r-1 positions.*/
359  *pi++ = cc; /* transform to new alphabet.*/
360  cc = (cc & mm) << s; /* shift right-end zero in chunk.*/
361  }
362  jj = dd + 1; /* new alphabet size.*/
363  }
364  x[n] = 0; /* end-of-string symbol is zero.*/
365  DBG_OUT << "end transformation jj=" << jj << std::endl;
366  return jj; /* return new alphabet size.*/
367  }
368 
369  /* Makes suffix array p of x. x becomes inverse of p. p and x are both of size
370  * n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original
371  * contents of x[n] is disregarded, the n-th symbol being regarded as
372  end-of-string smaller than all other symbols.*/
373  void sort(const int_iter & x, const int_iter & p, int64_t n, int64_t k, int64_t l)
374  {
375  int_iter pi, pk;
376  m_VV = x; /* set global values.*/
377  m_SA = p;
378  if (n >= k - l)
379  { /* if bucketing possible,*/
380  int64_t j = transform(m_VV, m_SA, n, k, l, n);
381  DBG_OUT << "begin bucketsort j=" << j << std::endl;
382  bucketsort(m_VV, m_SA, n, j); /* bucketsort on first r positions.*/
383  DBG_OUT << "end bucketsort" << std::endl;
384  }
385  else
386  {
387  transform(m_VV, m_SA, n, k, l, m_msb_mask - 1);
388  DBG_OUT << "initialize SA begin" << std::endl;
389  for (int64_t i = 0; i <= n; ++i) m_SA[i] = i; /* initialize I with suffix numbers.*/
390  DBG_OUT << "initialize SA end" << std::endl;
391  m_hh = 0;
392  sort_split(m_SA, n + 1); /* quicksort on first r positions.*/
393  }
394  m_hh = m_rr; /* number of symbols aggregated by transform.*/
395  // while ( is_neg(*m_SA) and mark_pos(*m_SA) <= n) {
396  while (to_sign(*m_SA) >= -n)
397  {
398  // std::cout<<"m_hh="<<m_hh<<std::endl;
399  DBG_OUT << "SA = ";
400  // for(size_t iii=0; iii<=(size_t)n; ++iii){
401  // uint64_t D = *(m_SA+iii);
402  // printf("%c%lld ", is_neg(D)?'-':' ', mark_pos(D));
403  //}
404  DBG_OUT << std::endl;
405  DBG_OUT << "TEXT = ";
406  // for(size_t iii=0; iii<=(size_t)n; ++iii){
407  // uint64_t D = *(m_VV+iii);
408  // printf("%c%lld ", is_neg(D)?'-':' ', mark_pos(D));
409  //}
410  DBG_OUT << std::endl;
411  DBG_OUT << "*m_SA=" << to_sign(*m_SA) << std::endl;
412  // std::cout<<"mark_pos(*m_SA)="<<mark_pos(*m_SA)<<std::endl;
413  pi = m_SA; /* pi is first position of group.*/
414  int64_t sl = 0; /* sl is length of sorted groups.*/
415  DBG_OUT << "m_hh=" << m_hh << std::endl;
416  do {
417  uint64_t s = *pi;
418  if (to_sign(s) < (int64_t)0)
419  {
420  pi += mark_pos(s); /* skip over sorted group.*/
421  sl += mark_pos(s); /* add length to sl.*/
422  }
423  else
424  {
425  if (sl)
426  {
427  *(pi - sl) = mark_neg(sl); /* combine sorted groups before pi.*/
428  sl = 0;
429  }
430  pk = m_SA + m_VV[s] + 1; /* pk-1 is last position of unsorted group.*/
431  sort_split(pi, pk - pi);
432  pi = pk; /* next group.*/
433  }
434  } while ((pi - m_SA) <= n);
435  if (sl) /* if the array ends with a sorted group.*/
436  *(pi - sl) = mark_neg(sl); /* combine sorted groups at end of m_SA.*/
437  m_hh = 2 * m_hh; /* double sorted-depth.*/
438  DBG_OUT << "m_hh=" << m_hh << std::endl;
439  }
440  for (int64_t i = 0; i <= n; ++i)
441  { /* reconstruct suffix array from inverse.*/
442  m_SA[m_VV[i]] = i;
443  }
444  }
445 
446  void do_sort(tIV & sa, tIV & x)
447  {
448  assert(x.size() > 0);
449  DBG_OUT << "x.width()=" << (int)x.width() << std::endl;
450  DBG_OUT << "x.size()=" << x.size() << std::endl;
451  DBG_OUT << "sa.width()=" << (int)sa.width() << std::endl;
452  DBG_OUT << "sa.size()=" << sa.size() << std::endl;
453  if (x.size() == 1)
454  {
455  sa = tIV(1, 0);
456  return;
457  }
458 
459  int64_t max_symbol = 0, min_symbol = x.width() < 64 ? bits::lo_set[x.width()] : 0x7FFFFFFFFFFFFFFFLL;
460 
461  for (size_type i = 0; i < x.size() - 1; ++i)
462  {
463  max_symbol = std::max(max_symbol, (int64_t)x[i]);
464  min_symbol = std::min(min_symbol, (int64_t)x[i]);
465  }
466 
467  if (0 == min_symbol) { throw std::logic_error("Text contains 0-symbol. Suffix array can not be constructed."); }
468  if (x[x.size() - 1] > 0)
469  {
470  throw std::logic_error("Last symbol is not 0-symbol. Suffix array can not be constructed.");
471  }
472  DBG_OUT << "sorter: min_symbol=" << min_symbol << std::endl;
473  DBG_OUT << "sorter: max_symbol=" << max_symbol << std::endl;
474 
475  int64_t n = x.size() - 1;
476  DBG_OUT << "x.size()-1=" << x.size() - 1 << " n=" << n << std::endl;
477  uint8_t width = std::max(bits::hi(max_symbol) + 2, bits::hi(n + 1) + 2);
478  DBG_OUT << "sorter: width=" << (int)width << " max_symbol_width=" << bits::hi(max_symbol) + 1
479  << " n_width=" << bits::hi(n) << std::endl;
480  util::expand_width(x, width);
481  sa = x;
482  if (sa.width() < x.width())
483  {
484  throw std::logic_error("Fixed size suffix array is to small for the specified text.");
485  return;
486  }
487 
488  m_msb = sa.width() - 1;
489  m_msb_mask = 1ULL << m_msb;
490  DBG_OUT << "sorter: m_msb=" << (int)m_msb << " m_msb_mask=" << m_msb_mask << std::endl;
491  sort(x.begin(), sa.begin(), x.size() - 1, max_symbol + 1, min_symbol);
492  }
493 
494  void sort(tIV & sa, const char * file_name, uint8_t num_bytes)
495  {
496  DBG_OUT << "sorter: sort(" << file_name << ")" << std::endl;
497  DBG_OUT << "sizeof(int_vector<>::difference_type)=" << sizeof(int_vector<>::difference_type) << std::endl;
498  util::clear(sa); // free space for sa
499  tIV x;
500  if (num_bytes == 0 and typeid(typename tIV::reference) == typeid(uint64_t))
501  {
502  DBG_OUT << "sorter: use int_vector<64>" << std::endl;
503  int_vector<> temp;
504  load_vector_from_file(temp, file_name, num_bytes);
505  x.resize(temp.size());
506  for (size_type i = 0; i < temp.size(); ++i) x[i] = temp[i];
507  }
508  else
509  {
510  load_vector_from_file(x, file_name, num_bytes);
512  }
513  do_sort(sa, x);
514  }
515 
516  template <class t_vec>
517  void sort(tIV & sa, t_vec & text)
518  {
519  tIV x;
520  x.resize(text.size());
521  for (size_type i = 0; i < text.size(); ++i) x[i] = text[i];
522  do_sort(sa, x);
523  }
524 };
525 
526 } // end namespace qsufsort
527 
528 } // end namespace sdsl
529 
530 #endif
A generic vector class for integers of width .
Definition: int_vector.hpp:253
size_type size() const noexcept
The number of elements in the int_vector.
void sort(const int_iter &x, const int_iter &p, int64_t n, int64_t k, int64_t l)
Definition: qsufsort.hpp:373
void do_sort(tIV &sa, tIV &x)
Definition: qsufsort.hpp:446
void sort(tIV &sa, t_vec &text)
Definition: qsufsort.hpp:517
void sort(tIV &sa, const char *file_name, uint8_t num_bytes)
Definition: qsufsort.hpp:494
int64_t transform(const int_iter &x, const int_iter &p, int64_t n, int64_t k, int64_t l, int64_t q)
Definition: qsufsort.hpp:304
int_vector.hpp contains the sdsl::int_vector class.
int_vector ::size_type size_type
void construct_sa(int_vector_type &sa, const char *file, uint8_t num_bytes)
Construct a suffix array for the sequence stored in a file.
Definition: qsufsort.hpp:60
void expand_width(t_int_vec &v, uint8_t new_width)
Expands the integer width to new_width >= v.width()
Definition: util.hpp:512
void bit_compress(t_int_vec &v)
Bit compress the int_vector.
Definition: util.hpp:487
Namespace for the succinct data structure library.
bool load_vector_from_file(t_int_vec &v, const std::string &file, uint8_t num_bytes=1, uint8_t max_int_width=64)
from disk.
Definition: io.hpp:187
#define DBG_OUT
Definition: qsufsort.hpp:26
constexpr static uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
Definition: bits.hpp:197
static SDSL_CONSTEXPR uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:661