Orcus
sax_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef INCLUDED_ORCUS_SAX_PARSER_HPP
9 #define INCLUDED_ORCUS_SAX_PARSER_HPP
10 
11 #include "sax_parser_base.hpp"
12 
13 #include <string_view>
14 
15 namespace orcus {
16 
18 {
24  static const uint8_t baseline_version = 10;
25 };
26 
28 {
29 public:
36  {
37  (void)param;
38  }
39 
/**
 * Default no-op callback. The parser invokes this right after it has read
 * the name of a declaration (the token following '<?').
 *
 * @param decl name of the declaration; ignored by this default
 *             implementation.
 */
void start_declaration([[maybe_unused]] std::string_view decl)
{
}
51 
/**
 * Default no-op callback. The parser invokes this once the closing '?>' of
 * a declaration has been matched.
 *
 * @param decl name of the declaration; ignored by this default
 *             implementation.
 */
void end_declaration([[maybe_unused]] std::string_view decl)
{
}
61 
68  {
69  (void)elem;
70  }
71 
78  {
79  (void)elem;
80  }
81 
/**
 * Default no-op callback for text content (plain character spans and CDATA
 * sections).
 *
 * @param val the text content; ignored by this default implementation.
 * @param transient flag supplied by the parser: true when the text was
 *                  assembled in the parser's cell buffer, otherwise the
 *                  parser's transient-stream setting. Ignored by this
 *                  default implementation.
 */
void characters(
    [[maybe_unused]] std::string_view val, [[maybe_unused]] bool transient)
{
}
100 
110  {
111  (void)attr;
112  }
113 };
114 
119 template<typename _Handler, typename _Config = sax_parser_default_config>
121 {
122 public:
123  typedef _Handler handler_type;
124  typedef _Config config_type;
125 
126  sax_parser(const char* content, const size_t size, handler_type& handler);
127  sax_parser(const char* content, const size_t size, bool transient_stream, handler_type& handler);
128  ~sax_parser();
129 
130  void parse();
131 
132 private:
133 
138  void header();
139  void body();
140  void element();
141  void element_open(std::ptrdiff_t begin_pos);
142  void element_close(std::ptrdiff_t begin_pos);
143  void special_tag();
144  void declaration(const char* name_check);
145  void cdata();
146  void doctype();
147  void characters();
148  void attribute();
149 
150 private:
151  handler_type& m_handler;
152 };
153 
154 template<typename _Handler, typename _Config>
156  const char* content, const size_t size, handler_type& handler) :
157  sax::parser_base(content, size, false),
158  m_handler(handler)
159 {
160 }
161 
162 template<typename _Handler, typename _Config>
163 sax_parser<_Handler,_Config>::sax_parser(
164  const char* content, const size_t size, bool transient_stream, handler_type& handler) :
165  sax::parser_base(content, size, transient_stream),
166  m_handler(handler)
167 {
168 }
169 
170 template<typename _Handler, typename _Config>
171 sax_parser<_Handler,_Config>::~sax_parser()
172 {
173 }
174 
175 template<typename _Handler, typename _Config>
176 void sax_parser<_Handler,_Config>::parse()
177 {
178  m_nest_level = 0;
179  mp_char = mp_begin;
180  header();
181  skip_space_and_control();
182  body();
183 
184  assert(m_buffer_pos == 0);
185 }
186 
187 template<typename _Handler, typename _Config>
188 void sax_parser<_Handler,_Config>::header()
189 {
190  // we don't handle multi byte encodings so we can just skip bom entry if exists.
191  skip_bom();
192  skip_space_and_control();
193  if (!has_char() || cur_char() != '<')
194  throw sax::malformed_xml_error("xml file must begin with '<'.", offset());
195 
196  if (config_type::baseline_version >= 11)
197  {
198  // XML version 1.1 requires a header declaration whereas in 1.0 it's
199  // optional.
200  if (next_char_checked() != '?')
201  throw sax::malformed_xml_error("xml file must begin with '<?'.", offset());
202 
203  declaration("xml");
204  }
205 }
206 
207 template<typename _Handler, typename _Config>
208 void sax_parser<_Handler,_Config>::body()
209 {
210  while (has_char())
211  {
212  if (cur_char() == '<')
213  {
214  element();
215  if (!m_root_elem_open)
216  // Root element closed. Stop parsing.
217  return;
218  }
219  else if (m_nest_level)
220  // Call characters only when in xml hierarchy.
221  characters();
222  else
223  next();
224  }
225 }
226 
227 template<typename _Handler, typename _Config>
228 void sax_parser<_Handler,_Config>::element()
229 {
230  assert(cur_char() == '<');
231  std::ptrdiff_t pos = offset();
232  char c = next_char_checked();
233  switch (c)
234  {
235  case '/':
236  element_close(pos);
237  return;
238  case '!':
239  special_tag();
240  return;
241  case '?':
242  declaration(nullptr);
243  return;
244  }
245 
246  element_open(pos);
247 }
248 
249 template<typename _Handler, typename _Config>
250 void sax_parser<_Handler,_Config>::element_open(std::ptrdiff_t begin_pos)
251 {
252  sax::parser_element elem;
253  element_name(elem, begin_pos);
254 
255  while (true)
256  {
257  skip_space_and_control();
258  char c = cur_char();
259  if (c == '/')
260  {
261  // Self-closing element: <element/>
262  if (next_and_char() != '>')
263  throw sax::malformed_xml_error("expected '/>' to self-close the element.", offset());
264  next();
265  elem.end_pos = offset();
266  m_handler.start_element(elem);
267  reset_buffer_pos();
268  m_handler.end_element(elem);
269  if (!m_nest_level)
270  m_root_elem_open = false;
271 #if ORCUS_DEBUG_SAX_PARSER
272  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
273 #endif
274  return;
275  }
276  else if (c == '>')
277  {
278  // End of opening element: <element>
279  next();
280  elem.end_pos = offset();
281  nest_up();
282  m_handler.start_element(elem);
283  reset_buffer_pos();
284 #if ORCUS_DEBUG_SAX_PARSER
285  cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
286 #endif
287  return;
288  }
289  else
290  attribute();
291  }
292 }
293 
294 template<typename _Handler, typename _Config>
295 void sax_parser<_Handler,_Config>::element_close(std::ptrdiff_t begin_pos)
296 {
297  assert(cur_char() == '/');
298  nest_down();
299  next_check();
300  sax::parser_element elem;
301  element_name(elem, begin_pos);
302 
303  if (cur_char() != '>')
304  throw sax::malformed_xml_error("expected '>' to close the element.", offset());
305  next();
306  elem.end_pos = offset();
307 
308  m_handler.end_element(elem);
309 #if ORCUS_DEBUG_SAX_PARSER
310  cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
311 #endif
312  if (!m_nest_level)
313  m_root_elem_open = false;
314 }
315 
316 template<typename _Handler, typename _Config>
317 void sax_parser<_Handler,_Config>::special_tag()
318 {
319  assert(cur_char() == '!');
320  // This can be either <![CDATA, <!--, or <!DOCTYPE.
321  size_t len = remains();
322  if (len < 2)
323  throw sax::malformed_xml_error("special tag too short.", offset());
324 
325  switch (next_and_char())
326  {
327  case '-':
328  {
329  // Possibly comment.
330  if (next_and_char() != '-')
331  throw sax::malformed_xml_error("comment expected.", offset());
332 
333  len -= 2;
334  if (len < 3)
335  throw sax::malformed_xml_error("malformed comment.", offset());
336 
337  next();
338  comment();
339  }
340  break;
341  case '[':
342  {
343  // Possibly a CDATA.
344  expects_next("CDATA[", 6);
345  if (has_char())
346  cdata();
347  }
348  break;
349  case 'D':
350  {
351  // check if this is a DOCTYPE.
352  expects_next("OCTYPE", 6);
353  skip_space_and_control();
354  if (has_char())
355  doctype();
356  }
357  break;
358  default:
359  throw sax::malformed_xml_error("failed to parse special tag.", offset());
360  }
361 }
362 
363 template<typename _Handler, typename _Config>
364 void sax_parser<_Handler,_Config>::declaration(const char* name_check)
365 {
366  assert(cur_char() == '?');
367  next_check();
368 
369  // Get the declaration name first.
370  std::string_view decl_name;
371  name(decl_name);
372 #if ORCUS_DEBUG_SAX_PARSER
373  cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
374 #endif
375 
376  if (name_check && decl_name != name_check)
377  {
378  std::ostringstream os;
379  os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
380  throw sax::malformed_xml_error(os.str(), offset());
381  }
382 
383  m_handler.start_declaration(decl_name);
384  skip_space_and_control();
385 
386  // Parse the attributes.
387  while (cur_char_checked() != '?')
388  {
389  attribute();
390  skip_space_and_control();
391  }
392  if (next_char_checked() != '>')
393  throw sax::malformed_xml_error("declaration must end with '?>'.", offset());
394 
395  m_handler.end_declaration(decl_name);
396  reset_buffer_pos();
397  next();
398 #if ORCUS_DEBUG_SAX_PARSER
399  cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
400 #endif
401 }
402 
403 template<typename _Handler, typename _Config>
404 void sax_parser<_Handler,_Config>::cdata()
405 {
406  size_t len = remains();
407  assert(len > 3);
408 
409  // Parse until we reach ']]>'.
410  const char* p0 = mp_char;
411  size_t i = 0, match = 0;
412  for (char c = cur_char(); i < len; ++i, c = next_and_char())
413  {
414  if (c == ']')
415  {
416  // Be aware that we may encounter a series of more than two ']'
417  // characters, in which case we'll only count the last two.
418 
419  if (match == 0)
420  // First ']'
421  ++match;
422  else if (match == 1)
423  // Second ']'
424  ++match;
425  }
426  else if (c == '>' && match == 2)
427  {
428  // Found ']]>'.
429  size_t cdata_len = i - 2;
430  m_handler.characters(std::string_view(p0, cdata_len), transient_stream());
431  next();
432  return;
433  }
434  else
435  match = 0;
436  }
437  throw sax::malformed_xml_error("malformed CDATA section.", offset());
438 }
439 
440 template<typename _Handler, typename _Config>
441 void sax_parser<_Handler,_Config>::doctype()
442 {
443  // Parse the root element first.
444  sax::doctype_declaration param;
445  name(param.root_element);
446  skip_space_and_control();
447 
448  // Either PUBLIC or SYSTEM.
449  size_t len = remains();
450  if (len < 6)
451  throw sax::malformed_xml_error("DOCTYPE section too short.", offset());
452 
453  param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
454  char c = cur_char();
455  if (c == 'P')
456  {
457  if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
458  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
459 
460  param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
461  }
462  else if (c == 'S')
463  {
464  if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
465  throw sax::malformed_xml_error("malformed DOCTYPE section.", offset());
466  }
467 
468  next_check();
469  skip_space_and_control();
470  has_char_throw("DOCTYPE section too short.");
471 
472  // Parse FPI.
473  value(param.fpi, false);
474 
475  has_char_throw("DOCTYPE section too short.");
476  skip_space_and_control();
477  has_char_throw("DOCTYPE section too short.");
478 
479  if (cur_char() == '>')
480  {
481  // Optional URI not given. Exit.
482 #if ORCUS_DEBUG_SAX_PARSER
483  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
484 #endif
485  m_handler.doctype(param);
486  next();
487  return;
488  }
489 
490  // Parse optional URI.
491  value(param.uri, false);
492 
493  has_char_throw("DOCTYPE section too short.");
494  skip_space_and_control();
495  has_char_throw("DOCTYPE section too short.");
496 
497  if (cur_char() != '>')
498  throw sax::malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
499 
500 #if ORCUS_DEBUG_SAX_PARSER
501  cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
502 #endif
503  m_handler.doctype(param);
504  next();
505 }
506 
507 template<typename _Handler, typename _Config>
508 void sax_parser<_Handler,_Config>::characters()
509 {
510  const char* p0 = mp_char;
511  for (; has_char(); next())
512  {
513  if (cur_char() == '<')
514  break;
515 
516  if (cur_char() == '&')
517  {
518  // Text span with one or more encoded characters. Parse using cell buffer.
519  cell_buffer& buf = get_cell_buffer();
520  buf.reset();
521  buf.append(p0, mp_char-p0);
522  characters_with_encoded_char(buf);
523  if (buf.empty())
524  m_handler.characters(std::string_view{}, transient_stream());
525  else
526  m_handler.characters(std::string_view(buf.get(), buf.size()), true);
527  return;
528  }
529  }
530 
531  if (mp_char > p0)
532  {
533  std::string_view val(p0, mp_char-p0);
534  m_handler.characters(val, transient_stream());
535  }
536 }
537 
538 template<typename _Handler, typename _Config>
539 void sax_parser<_Handler,_Config>::attribute()
540 {
541  sax::parser_attribute attr;
542  attribute_name(attr.ns, attr.name);
543 
544 #if ORCUS_DEBUG_SAX_PARSER
545  cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
546 #endif
547 
548  skip_space_and_control();
549 
550  char c = cur_char();
551  if (c != '=')
552  {
553  std::ostringstream os;
554  os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
555  throw sax::malformed_xml_error(os.str(), offset());
556  }
557 
558  next_check(); // skip the '='.
559  skip_space_and_control();
560 
561  attr.transient = value(attr.value, true);
562  if (attr.transient)
563  // Value is stored in a temporary buffer. Push a new buffer.
564  inc_buffer_pos();
565 
566 #if ORCUS_DEBUG_SAX_PARSER
567  cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
568 #endif
569 
570  m_handler.attribute(attr);
571 }
572 
573 }
574 
575 #endif
576 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: parser_base.hpp:41
Definition: sax_parser_base.hpp:108
Definition: sax_parser.hpp:28
void end_declaration(std::string_view decl)
Definition: sax_parser.hpp:57
void doctype(const orcus::sax::doctype_declaration &param)
Definition: sax_parser.hpp:35
void attribute(const orcus::sax::parser_attribute &attr)
Definition: sax_parser.hpp:109
void characters(std::string_view val, bool transient)
Definition: sax_parser.hpp:96
void start_declaration(std::string_view decl)
Definition: sax_parser.hpp:47
void end_element(const orcus::sax::parser_element &elem)
Definition: sax_parser.hpp:77
void start_element(const orcus::sax::parser_element &elem)
Definition: sax_parser.hpp:67
Definition: sax_parser.hpp:121
Definition: sax_parser_base.hpp:45
Definition: sax_parser_base.hpp:100
Definition: sax_parser_base.hpp:85
Definition: sax_parser.hpp:18
static const uint8_t baseline_version
Definition: sax_parser.hpp:24