SDSL  3.0.0
Succinct Data Structure Library
construct.hpp
Go to the documentation of this file.
1 // Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2 // Please see the AUTHORS file for details. Use of this source code is governed
3 // by a BSD license that can be found in the LICENSE file.
9 #ifndef INCLUDED_SDSL_CONSTRUCT
10 #define INCLUDED_SDSL_CONSTRUCT
11 
12 #include <string>
13 
14 #include <sdsl/construct_bwt.hpp>
15 #include <sdsl/construct_lcp.hpp>
16 #include <sdsl/construct_sa.hpp>
17 #include <sdsl/int_vector.hpp>
18 #include <sdsl/sdsl_concepts.hpp>
19 
20 namespace sdsl
21 {
22 
23 template <class int_vector>
24 bool contains_no_zero_symbol(const int_vector & text, const std::string & file)
25 {
26  for (int_vector_size_type i = 0; i < text.size(); ++i)
27  {
28  if ((uint64_t)0 == text[i])
29  {
30  throw std::logic_error(std::string("Error: File \"") + file + "\" contains zero symbol.");
31  return false;
32  }
33  }
34  return true;
35 }
36 
37 template <class int_vector>
39 {
40  text.resize(text.size() + 1);
41  text[text.size() - 1] = 0;
42 }
43 
44 template <class t_index>
45 void construct(t_index & idx, std::string file, uint8_t num_bytes = 0, bool move_input = false)
46 {
47  tMSS file_map;
48  cache_config config;
49  if (is_ram_file(file))
50  {
51  config.dir = "@";
52  config.delete_data = move_input;
53  }
54  construct(idx, file, config, num_bytes);
55 }
56 
57 template <class t_index, class t_data>
58 void construct_im(t_index & idx, t_data && data, uint8_t num_bytes = 0)
59 {
61  store_to_file(data, tmp_file);
62  construct(idx, tmp_file, num_bytes, std::is_rvalue_reference<t_data &&>::value);
64 }
65 
67 
76 template <class t_index>
77 void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes = 0)
78 {
79  // delegate to CSA or CST construction
80  typename t_index::index_category index_tag;
81  construct(idx, file, config, num_bytes, index_tag);
82 }
83 
84 // Specialization for WTs
85 template <class t_index>
86 void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, wt_tag)
87 {
88  auto event = memory_monitor::event("construct wavelet tree");
89  if ((t_index::alphabet_category::WIDTH == 8 and num_bytes <= 1) or
90  (t_index::alphabet_category::WIDTH == 0 and num_bytes != 'd'))
91  {
93  std::ios::in,
94  1024 * 1024,
95  num_bytes * 8,
96  (bool)num_bytes);
97  idx = t_index(text_buf.begin(), text_buf.end(), config.dir);
98  }
99  else
100  {
102  load_vector_from_file(text, file, num_bytes);
103  std::string tmp_key = util::to_string(util::pid()) + "_" + util::to_string(util::id());
104  std::string tmp_file_name = cache_file_name(tmp_key, config);
105  store_to_file(text, tmp_file_name);
106  util::clear(text);
107  {
109  idx = t_index(text_buf.begin(), text_buf.end(), config.dir);
110  }
111  sdsl::remove(tmp_file_name);
112  }
113 }
114 
115 // Specialization for CSAs
116 template <class t_index>
117 void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, csa_tag)
118 {
119  auto event = memory_monitor::event("construct CSA");
120  constexpr uint8_t width = t_index::alphabet_category::WIDTH;
122  const char * KEY_BWT = key_bwt_trait<width>::KEY_BWT;
124  {
125  auto event = memory_monitor::event("parse input text");
126  // (1) check, if the text is cached
127  if (!cache_file_exists(KEY_TEXT, config))
128  {
129  text_type text;
130  load_vector_from_file(text, file, num_bytes);
131  if (contains_no_zero_symbol(text, file))
132  {
133  if (!is_ram_file(file))
134  {
135  append_zero_symbol(text);
136  store_to_cache(text, KEY_TEXT, config);
137  }
138  else
139  {
140  auto text_mapper = write_out_mapper<width>::create(cache_file_name(KEY_TEXT, config),
141  text.size() + 1,
142  text.width());
143  std::copy(text.begin(), text.end(), text_mapper.begin());
144  text_mapper[text.size()] = 0;
145  }
146  }
147  }
148  register_cache_file(KEY_TEXT, config);
149  }
150  if (config.delete_data) { sdsl::remove(file); }
151  {
152  // (2) check, if the suffix array is cached
153  auto event = memory_monitor::event("SA");
154  if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_index::alphabet_category::WIDTH>(config); }
156  }
157  {
158  // (3) construct BWT
159  auto event = memory_monitor::event("BWT");
160  if (!cache_file_exists(KEY_BWT, config)) { construct_bwt<t_index::alphabet_category::WIDTH>(config); }
161  register_cache_file(KEY_BWT, config);
162  }
163  {
164  // (4) use BWT to construct the CSA
165  auto event = memory_monitor::event("construct CSA");
166  idx = t_index(config);
167  }
168  if (config.delete_files)
169  {
170  auto event = memory_monitor::event("delete temporary files");
171  util::delete_all_files(config.file_map);
172  }
173 }
174 
175 // Specialization for standalone LCPs
176 template <class t_index, uint8_t t_width>
177 void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, lcp_tag)
178 {
179  auto event = memory_monitor::event("construct compressed LCP");
181  typedef int_vector<t_width> text_type;
182  {
183  // (2) check, if the longest common prefix array is cached
184  auto event = memory_monitor::event("LCP");
185  if (!cache_file_exists(conf::KEY_LCP, config))
186  {
187  {
188  auto event = memory_monitor::event("parse input text");
189  // (1) check, if the text is cached
190  if (!cache_file_exists(KEY_TEXT, config))
191  {
192  text_type text;
193  load_vector_from_file(text, file, num_bytes);
194  if (contains_no_zero_symbol(text, file))
195  {
196  append_zero_symbol(text);
197  store_to_cache(text, KEY_TEXT, config);
198  }
199  }
200  register_cache_file(KEY_TEXT, config);
201  }
202  {
203  // (2) check, if the suffix array is cached
204  auto event = memory_monitor::event("SA");
205  if (!cache_file_exists(conf::KEY_SA, config)) { construct_sa<t_width>(config); }
207  }
208  if (t_width == 8) { construct_lcp_semi_extern_PHI(config); }
209  else
210  {
211  construct_lcp_PHI<t_width>(config);
212  }
213  }
215  }
216  {
217  auto event = memory_monitor::event("compressed LCP");
218  idx = t_index(config);
219  }
220  if (config.delete_files)
221  {
222  auto event = memory_monitor::event("delete temporary files");
223  util::delete_all_files(config.file_map);
224  }
225 }
226 
227 // Specialization for standalone LCPs
228 template <class t_index>
229 void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, lcp_tag tag)
230 {
231  if (1 == num_bytes) { construct<t_index, 8>(idx, file, config, num_bytes, tag); }
232  else
233  {
234  construct<t_index, 0>(idx, file, config, num_bytes, tag);
235  }
236 }
237 
238 // Specialization for CSTs
239 template <class t_index>
240 void construct(t_index & idx, const std::string & file, cache_config & config, uint8_t num_bytes, cst_tag)
241 {
242  auto event = memory_monitor::event("construct CST");
245  csa_tag csa_t;
246  {
247  // (1) check, if the compressed suffix array is cached
248  typename t_index::csa_type csa;
249  if (!cache_file_exists(std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config))
250  {
251  cache_config csa_config(false, config.dir, config.id, config.file_map);
252  construct(csa, file, csa_config, num_bytes, csa_t);
253  auto event = memory_monitor::event("store CSA");
254  config.file_map = csa_config.file_map;
255  store_to_cache(csa, std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config);
256  }
257  register_cache_file(std::string(conf::KEY_CSA) + "_" + util::class_to_hash(csa), config);
258  }
259  {
260  // (2) check, if the longest common prefix array is cached
261  auto event = memory_monitor::event("LCP");
262  register_cache_file(KEY_TEXT, config);
263  register_cache_file(KEY_BWT, config);
265  if (!cache_file_exists(conf::KEY_LCP, config))
266  {
267  if (t_index::alphabet_category::WIDTH == 8) { construct_lcp_semi_extern_PHI(config); }
268  else
269  {
270  construct_lcp_PHI<t_index::alphabet_category::WIDTH>(config);
271  }
272  }
274  }
275  {
276  auto event = memory_monitor::event("CST");
277  idx = t_index(config);
278  }
279  if (config.delete_files)
280  {
281  auto event = memory_monitor::event("delete temporary files");
282  util::delete_all_files(config.file_map);
283  }
284 }
285 
286 } // end namespace sdsl
287 #endif
A generic vector class for integers of width .
Definition: int_vector.hpp:253
size_type size() const noexcept
The number of elements in the int_vector.
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:545
static mm_event_proxy event(const std::string &name)
static int_vector_mapper< t_width > create(const std::string &key, cache_config &config)
construct_bwt.hpp contains a space and time efficient construction method for the Burrows and Wheeler...
construct_lcp.hpp contains a space and time efficient construction method for lcp arrays
construct_sa.hpp contains an interface to access suffix array construction algorithms
int_vector.hpp contains the sdsl::int_vector class.
constexpr char KEY_CSA[]
Definition: config.hpp:38
constexpr char KEY_SA[]
Definition: config.hpp:37
constexpr char KEY_TEXT[]
Definition: config.hpp:41
constexpr char KEY_LCP[]
Definition: config.hpp:44
constexpr char KEY_BWT[]
Definition: config.hpp:35
int remove(const std::string &name)
Remove the file with key name
Definition: ram_fs.hpp:69
uint64_t id()
uint64_t pid()
std::string to_string(const T &t, int w=1)
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
bool is_ram_file(const std::string &file)
Determines if the given file is a RAM-file.
Definition: ram_fs.hpp:158
std::map< std::string, std::string > tMSS
Definition: config.hpp:50
std::string ram_file_name(const std::string &file)
Returns the corresponding RAM-file name for file.
Definition: ram_fs.hpp:174
bool load_vector_from_file(t_int_vec &v, const std::string &file, uint8_t num_bytes=1, uint8_t max_int_width=64)
from disk.
Definition: io.hpp:187
std::string tmp_file(const cache_config &config, std::string name_part="")
Returns a name for a temporary file. I.e. the name was not used before.
Definition: io.hpp:698
bool contains_no_zero_symbol(const int_vector &text, const std::string &file)
Definition: construct.hpp:24
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition: construct.hpp:45
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
uint64_t int_vector_size_type
Definition: config.hpp:48
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
Definition: io.hpp:798
bool cache_file_exists(const std::string &key, const cache_config &config)
Checks if the resource specified by the key exists in the cache.
Definition: io.hpp:672
bool store_to_cache(const T &v, const std::string &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition: io.hpp:737
int remove(const std::string &)
Remove a file.
Definition: ram_fs.hpp:194
void append_zero_symbol(int_vector &text)
Definition: construct.hpp:38
void construct_im(t_index &idx, t_data &&data, uint8_t num_bytes=0)
Definition: construct.hpp:58
Contains declarations and definitions of data structure concepts.
Helper class for construction process.
Definition: config.hpp:67
std::string id
Definition: config.hpp:72
std::string dir
Definition: config.hpp:71
Helper classes to transform width=0 and width=8 to corresponding bwt key.
Definition: config.hpp:110
Helper classes to transform width=0 and width=8 to corresponding text key.
Definition: config.hpp:91