Orcus
csv_parser.hpp
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6  */
7 
8 #ifndef ORCUS_CSV_PARSER_HPP
9 #define ORCUS_CSV_PARSER_HPP
10 
11 #include "csv_parser_base.hpp"
12 
13 namespace orcus {
14 
16 {
17 public:
/**
 * No-op handler callback.  csv_parser::parse() calls this once on its
 * handler before the first row is processed.
 */
void begin_parse()
{
}
22 
/**
 * No-op handler callback.  csv_parser::parse() calls this once on its
 * handler after the last row has been processed.
 */
void end_parse()
{
}
27 
/**
 * No-op handler callback.  csv_parser::row() calls this on its handler
 * at the start of every row, before any cell of that row is reported.
 */
void begin_row()
{
}
32 
/**
 * No-op handler callback.  csv_parser::row() calls this on its handler
 * when a row ends, i.e. after a line feed or when the input runs out.
 */
void end_row()
{
}
37 
/**
 * No-op handler callback invoked by the parser for every cell it finds.
 *
 * @param p pointer to the first character of the cell value.  The parser
 *          passes a null pointer for an empty unquoted cell.
 * @param n number of characters in the cell value.
 * @param transient true when the value is handed over from the parser's
 *          internal cell buffer (used for quoted cells that contain
 *          escaped quotes) instead of pointing into the input stream.
 *          That buffer is reset and reused by the parser, so an
 *          implementation that keeps the value should copy it.
 */
void cell(const char* p, size_t n, bool transient)
{
    // Intentionally empty; the casts only silence unused-parameter warnings.
    static_cast<void>(p);
    static_cast<void>(n);
    static_cast<void>(transient);
}
54 };
55 
56 template<typename _Handler>
58 {
59 public:
60  typedef _Handler handler_type;
61 
62  csv_parser(const char* p, size_t n, handler_type& hdl, const csv::parser_config& config);
63  void parse();
64 
65 private:
66 
67  // handlers
68  void row();
69  void cell();
70  void quoted_cell();
71 
72  void parse_cell_with_quote(const char* p0, size_t len0);
73 
77  void push_cell_value(const char* p, size_t n);
78 
79 private:
80  handler_type& m_handler;
81 };
82 
83 template<typename _Handler>
85  const char* p, size_t n, handler_type& hdl, const csv::parser_config& config) :
86  csv::parser_base(p, n, config), m_handler(hdl) {}
87 
88 template<typename _Handler>
89 void csv_parser<_Handler>::parse()
90 {
91 #if ORCUS_DEBUG_CSV
92  for (const char* p = mp_begin; p < mp_end; ++p)
93  std::cout << *p;
94  std::cout << std::endl;
95 #endif
96 
97  m_handler.begin_parse();
98  while (has_char())
99  row();
100  m_handler.end_parse();
101 }
102 
103 template<typename _Handler>
104 void csv_parser<_Handler>::row()
105 {
106  m_handler.begin_row();
107  while (true)
108  {
109  if (is_text_qualifier(cur_char()))
110  quoted_cell();
111  else
112  cell();
113 
114  if (!has_char())
115  {
116  m_handler.end_row();
117  return;
118  }
119 
120  char c = cur_char();
121  if (c == '\n')
122  {
123  next();
124 #if ORCUS_DEBUG_CSV
125  cout << "(LF)" << endl;
126 #endif
127  m_handler.end_row();
128  return;
129  }
130 
131  if (!is_delim(c))
132  throw orcus::csv::parse_error("expected a delimiter");
133 
134  next();
135 
136  if (m_config.trim_cell_value)
137  skip_blanks();
138 
139  if (!has_char())
140  {
141  m_handler.end_row();
142  return;
143  }
144  }
145 }
146 
147 template<typename _Handler>
148 void csv_parser<_Handler>::cell()
149 {
150  const char* p = mp_char;
151  size_t len = 0;
152  char c = cur_char();
153  while (c != '\n' && !is_delim(c))
154  {
155  ++len;
156  next();
157  if (!has_char())
158  break;
159  c = cur_char();
160  }
161 
162  if (!len)
163  p = nullptr;
164 
165  push_cell_value(p, len);
166 }
167 
168 template<typename _Handler>
169 void csv_parser<_Handler>::quoted_cell()
170 {
171 #if ORCUS_DEBUG_CSV
172  cout << "--- quoted cell" << endl;
173 #endif
174  char c = cur_char();
175  assert(is_text_qualifier(c));
176  next(); // Skip the opening quote.
177  if (!has_char())
178  return;
179 
180  const char* p0 = mp_char;
181  size_t len = 1;
182  for (; has_char(); next(), ++len)
183  {
184  c = cur_char();
185 #if ORCUS_DEBUG_CSV
186  cout << "'" << c << "'" << endl;
187 #endif
188  if (!is_text_qualifier(c))
189  continue;
190 
191  // current char is a quote. Check if the next char is also a text
192  // qualifier.
193 
194  if (has_next() && is_text_qualifier(next_char()))
195  {
196  next();
197  parse_cell_with_quote(p0, len);
198  return;
199  }
200 
201  // Closing quote.
202  m_handler.cell(p0, len-1, false);
203  next();
204  skip_blanks();
205  return;
206  }
207 
208  // Stream ended prematurely. Handle it gracefully.
209  m_handler.cell(p0, len, false);
210 }
211 
212 template<typename _Handler>
213 void csv_parser<_Handler>::parse_cell_with_quote(const char* p0, size_t len0)
214 {
215 #if ORCUS_DEBUG_CSV
216  using namespace std;
217  cout << "--- parse cell with quote" << endl;
218 #endif
219  assert(is_text_qualifier(cur_char()));
220 
221  // Push the preceding chars to the temp buffer.
222  m_cell_buf.reset();
223  m_cell_buf.append(p0, len0);
224 
225  // Parse the rest, until the closing quote.
226  next();
227  const char* p_cur = mp_char;
228  size_t cur_len = 0;
229  for (; has_char(); next(), ++cur_len)
230  {
231  char c = cur_char();
232 #if ORCUS_DEBUG_CSV
233  cout << "'" << c << "'" << endl;
234 #endif
235  if (!is_text_qualifier(c))
236  continue;
237 
238  if (has_next() && is_text_qualifier(next_char()))
239  {
240  // double quotation. Copy the current segment to the cell buffer.
241  m_cell_buf.append(p_cur, cur_len);
242 
243  next(); // to the 2nd quote.
244  p_cur = mp_char;
245  cur_len = 0;
246  continue;
247  }
248 
249  // closing quote. Flush the current segment to the cell
250  // buffer, push the value to the handler, and exit normally.
251  m_cell_buf.append(p_cur, cur_len);
252 
253  m_handler.cell(m_cell_buf.get(), m_cell_buf.size(), true);
254  next();
255  skip_blanks();
256  return;
257  }
258 
259  // Stream ended prematurely.
260  throw csv::parse_error("stream ended prematurely while parsing quoted cell.");
261 }
262 
263 template<typename _Handler>
264 void csv_parser<_Handler>::push_cell_value(const char* p, size_t n)
265 {
266  size_t len = n;
267 
268  if (m_config.trim_cell_value)
269  {
270  // Trim any leading blanks.
271  for (size_t i = 0; i < n; ++i, --len, ++p)
272  {
273  if (!is_blank(*p))
274  break;
275  }
276 
277  // Trim any trailing blanks.
278  if (len)
279  {
280  const char* p_end = p + (len-1);
281  for (; p != p_end; --p_end, --len)
282  {
283  if (!is_blank(*p_end))
284  break;
285  }
286  }
287  }
288 
289  m_handler.cell(p, len, false);
290 #if ORCUS_DEBUG_CSV
291  if (len)
292  cout << "(cell:'" << std::string(p, len) << "')" << endl;
293  else
294  cout << "(cell:'')" << endl;
295 #endif
296 }
297 
298 }
299 
300 #endif
301 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition: csv_parser_base.hpp:58
Definition: csv_parser_base.hpp:67
Definition: csv_parser.hpp:16
void end_row()
Definition: csv_parser.hpp:36
void end_parse()
Definition: csv_parser.hpp:26
void begin_row()
Definition: csv_parser.hpp:31
void cell(const char *p, size_t n, bool transient)
Definition: csv_parser.hpp:50
void begin_parse()
Definition: csv_parser.hpp:21
Definition: csv_parser.hpp:58
Definition: parser_base.hpp:41
Definition: config.hpp:20
Definition: csv_parser_base.hpp:37