bes  Updated for version 3.20.10
url_impl.cc
1 
2 // -*- mode: c++; c-basic-offset:4 -*-
3 
4 // This file is part of the BES http package, part of the Hyrax data server.
5 
6 // Copyright (c) 2020 OPeNDAP, Inc.
7 // Author: Nathan Potter <ndp@opendap.org>
8 //
9 // This library is free software; you can redistribute it and/or
10 // modify it under the terms of the GNU Lesser General Public
11 // License as published by the Free Software Foundation; either
12 // version 2.1 of the License, or (at your option) any later version.
13 //
14 // This library is distributed in the hope that it will be useful,
15 // but WITHOUT ANY WARRANTY; without even the implied warranty of
16 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 // Lesser General Public License for more details.
18 //
19 // You should have received a copy of the GNU Lesser General Public
20 // License along with this library; if not, write to the Free Software
21 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 //
23 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
24 
25 // Authors:
26 // ndp Nathan Potter <ndp@opendap.org>
27 
28 #include "config.h"
29 
30 #include <string>
31 #include <sstream>
32 #include <map>
33 #include <vector>
34 #include <algorithm>
35 #include <cctype>
36 #include <functional>
37 #include <time.h>
38 
39 #include "BESDebug.h"
40 #include "BESUtil.h"
41 #include "BESCatalogList.h"
42 #include "HttpNames.h"
43 
44 #include "url_impl.h"
45 
46 using namespace std;
47 using std::chrono::system_clock;
48 
49 #define MODULE HTTP_MODULE
50 #define prolog string("url::").append(__func__).append("() - ")
51 
52 #define PROTOCOL_KEY "http_url_protocol"
53 #define HOST_KEY "http_url_host"
54 #define PATH_KEY "http_url_path"
55 #define QUERY_KEY "http_url_query"
56 #define SOURCE_URL_KEY "http_url_target_url"
57 #define INGEST_TIME_KEY "http_url_ingest_time"
58 
59 
60 namespace http {
61 
#if 0
/**
 * Disabled constructor: populates a url from a flat key/value map.
 *
 * The well-known keys (PROTOCOL_KEY, HOST_KEY, PATH_KEY, QUERY_KEY and
 * SOURCE_URL_KEY) are copied into the matching data members. Every other
 * entry of the map is stored in d_query_kvp as a query parameter.
 *
 * @param kvp The key/value pairs to read.
 */
url::url(const map<string,string> &kvp)
{
    // Working copy: recognized keys are removed from it, what is left over
    // at the end is treated as query parameters.
    map<string,string> leftovers = kvp;
    map<string,string>::const_iterator found;

    found = kvp.find(PROTOCOL_KEY);
    if (found != kvp.end() && leftovers.find(PROTOCOL_KEY) != leftovers.end()) {
        d_protocol = found->second;
        leftovers.erase(found->first);
        BESDEBUG(MODULE, prolog << "Located PROTOCOL_KEY(" << PROTOCOL_KEY << ") value: " << d_protocol << endl);
    }

    found = kvp.find(HOST_KEY);
    if (found != kvp.end() && leftovers.find(HOST_KEY) != leftovers.end()) {
        d_host = found->second;
        leftovers.erase(found->first);
        BESDEBUG(MODULE, prolog << "Located HOST_KEY(" << HOST_KEY << ") value: " << d_host << endl);
    }

    found = kvp.find(PATH_KEY);
    if (found != kvp.end() && leftovers.find(PATH_KEY) != leftovers.end()) {
        d_path = found->second;
        leftovers.erase(found->first);
        BESDEBUG(MODULE, prolog << "Located PATH_KEY(" << PATH_KEY << ") value: " << d_path << endl);
    }

    found = kvp.find(QUERY_KEY);
    if (found != kvp.end() && leftovers.find(QUERY_KEY) != leftovers.end()) {
        d_query = found->second;
        leftovers.erase(found->first);
        BESDEBUG(MODULE, prolog << "Located QUERY_KEY(" << QUERY_KEY << ") value: " << d_query << endl);
    }

    found = kvp.find(SOURCE_URL_KEY);
    if (found != kvp.end() && leftovers.find(SOURCE_URL_KEY) != leftovers.end()) {
        d_source_url_str = found->second;
        leftovers.erase(found->first);
        BESDEBUG(MODULE, prolog << "Located SOURCE_URL_KEY(" << SOURCE_URL_KEY << ") value: " << d_source_url_str << endl);
    }

    // Everything that was not claimed above becomes a query parameter value.
    for (const auto &entry : leftovers) {
        const string key = entry.first;
        const string value = entry.second;
        map<string, vector<string>* >::const_iterator slot = d_query_kvp.find(key);
        if (slot == d_query_kvp.end()) {
            vector<string> *fresh = new vector<string>();
            fresh->push_back(value);
            d_query_kvp.insert(pair<string, vector<string>*>(key, fresh));
        }
        else {
            slot->second->push_back(value);
        }
    }
}
#endif
127 
131 url::~url()
132 {
133  if(!d_query_kvp.empty()){
134  map<string, vector<string>* >::const_iterator it;
135  for(it = d_query_kvp.begin() ; it != d_query_kvp.end(); it++){
136  delete it->second;
137  }
138  }
139 }
140 
141 
149 void url::parse() {
150  const string protocol_end("://");
151  BESDEBUG(MODULE, prolog << "BEGIN (parsing: '" << d_source_url_str << "')" << endl);
152 
153  // If the supplied string does not start with a protocol, we assume it must be a
154  // path relative the BES.Catalog.catalog.RootDirectory because that's the only
155  // thing we are going to allow, even when it starts with slash '/'. Basically
156  // we force it to be in the BES.Catalog.catalog.RootDirectory tree.
157  if(d_source_url_str.find(protocol_end) == string::npos){
158  // Since we want a valid path in the file system tree for data, we make it so by adding
159  // the file path that starts with the catalog root dir.
161  string default_catalog_name = bcl->default_catalog_name();
162  BESDEBUG(MODULE, prolog << "Searching for catalog: " << default_catalog_name << endl);
163  BESCatalog *bcat = bcl->find_catalog(default_catalog_name);
164  if (bcat) {
165  BESDEBUG(MODULE, prolog << "Found catalog: " << bcat->get_catalog_name() << endl);
166  } else {
167  string msg = "OUCH! Unable to locate default catalog!";
168  BESDEBUG(MODULE, prolog << msg << endl);
169  throw BESInternalError(msg, __FILE__, __LINE__);
170  }
171  string catalog_root = bcat->get_root();
172  BESDEBUG(MODULE, prolog << "Catalog root: " << catalog_root << endl);
173 
174  string file_path = BESUtil::pathConcat(catalog_root,d_source_url_str);
175  if(file_path[0] != '/')
176  file_path = "/" + file_path;
177  d_source_url_str = FILE_PROTOCOL + file_path;
178  }
179 
180  const string parse_url_target(d_source_url_str);
181 
182  string::const_iterator prot_i = search(parse_url_target.begin(), parse_url_target.end(),
183  protocol_end.begin(), protocol_end.end());
184 
185  if (prot_i != parse_url_target.end())
186  advance(prot_i, protocol_end.length());
187 
188  d_protocol.reserve(distance(parse_url_target.begin(), prot_i));
189  transform(parse_url_target.begin(), prot_i,
190  back_inserter(d_protocol),
191  ptr_fun<int, int>(tolower)); // protocol is icase
192  if (prot_i == parse_url_target.end())
193  return;
194 
195  if (d_protocol == FILE_PROTOCOL) {
196  d_path = parse_url_target.substr(d_protocol.length());
197  BESDEBUG(MODULE, prolog << "FILE_PROTOCOL d_path: " << d_path << endl);
198  }
199  else if( d_protocol == HTTP_PROTOCOL || d_protocol == HTTPS_PROTOCOL){
200  string::const_iterator path_i = find(prot_i, parse_url_target.end(), '/');
201  d_host.reserve(distance(prot_i, path_i));
202  transform(prot_i, path_i,
203  back_inserter(d_host),
204  ptr_fun<int, int>(tolower)); // host is icase
205  string::const_iterator query_i = find(path_i, parse_url_target.end(), '?');
206  d_path.assign(path_i, query_i);
207  if (query_i != parse_url_target.end())
208  ++query_i;
209  d_query.assign(query_i, parse_url_target.end());
210 
211  if (!d_query.empty()) {
212  vector<string> records;
213  string delimiters = "&";
214  BESUtil::tokenize(d_query, records, delimiters);
215  vector<string>::iterator i = records.begin();
216  for (; i != records.end(); i++) {
217  size_t index = i->find('=');
218  if (index != string::npos) {
219  string key = i->substr(0, index);
220  string value = i->substr(index + 1);
221  BESDEBUG(MODULE, prolog << "key: " << key << " value: " << value << endl);
222  map<string, vector<string> *>::const_iterator record_it;
223  record_it = d_query_kvp.find(key);
224  if (record_it != d_query_kvp.end()) {
225  vector<string> *values = record_it->second;
226  values->push_back(value);
227  } else {
228  vector<string> *values = new vector<string>();
229  values->push_back(value);
230  d_query_kvp.insert(pair<string, vector<string> *>(key, values));
231  }
232  }
233  }
234  }
235  }
236  else {
237  stringstream msg;
238  msg << prolog << "Unsupported URL protocol " << d_protocol << " found in URL: " << d_source_url_str;
239  BESDEBUG(MODULE, msg.str() << endl);
240  throw BESInternalError(msg.str(), __FILE__, __LINE__);
241  }
242  BESDEBUG(MODULE, prolog << "END (parsing: '" << d_source_url_str << "')" << endl);
243 
244 }
245 
246 
252 string url::query_parameter_value(const string &key) const
253 {
254  string value;
255  map<string, vector<string>* >::const_iterator it;
256  it = d_query_kvp.find(key);
257  if(it != d_query_kvp.end()){
258  vector<string> *values = it->second;
259  if(!values->empty()){
260  value = (*values)[0];
261  }
262  }
263  return value;
264 }
265 
271 void url::query_parameter_values(const string &key, vector<string> &values) const
272 {
273  map<string, vector<string>* >::const_iterator it;
274  it = d_query_kvp.find(key);
275  if(it != d_query_kvp.end()){
276  values = *it->second;
277  }
278 }
279 
#if 0

/**
 * Disabled: flattens this url into a key/value map.
 *
 * The basic members are stored under PROTOCOL_KEY, HOST_KEY, PATH_KEY,
 * QUERY_KEY, SOURCE_URL_KEY and INGEST_TIME_KEY. Each query parameter is then
 * added under its own name; only the first value of a multi-valued key is
 * used. map::insert() does not overwrite, so keys already in kvp are kept.
 *
 * @param kvp Value-result parameter that receives the entries.
 */
void url::kvp(map<string,string> &kvp){
    typedef map<string,string>::value_type entry_t;

    // Do the basic stuff
    kvp.insert(entry_t(PROTOCOL_KEY, d_protocol));
    kvp.insert(entry_t(HOST_KEY, d_host));
    kvp.insert(entry_t(PATH_KEY, d_path));
    kvp.insert(entry_t(QUERY_KEY, d_query));
    kvp.insert(entry_t(SOURCE_URL_KEY, d_source_url_str));

    stringstream ingest_time_text;
    ingest_time_text << d_ingest_time;
    kvp.insert(entry_t(INGEST_TIME_KEY, ingest_time_text.str()));

    // Now grab the query string. Only the first value of multi valued keys is used.
    for (const auto &param : d_query_kvp) {
        kvp.insert(entry_t(param.first, (*param.second)[0]));
    }
}
#endif
305 
312 bool url::is_expired()
313 {
314 
315  bool stale;
316  std::time_t now = system_clock::to_time_t(system_clock::now());
317 
318  BESDEBUG(MODULE, prolog << "now: " << now << endl);
319  // We set the expiration time to the default, in case other avenues don't work out so well.
320  std::time_t expires_time = ingest_time() + HTTP_EFFECTIVE_URL_DEFAULT_EXPIRES_INTERVAL;
321 
322  string cf_expires = query_parameter_value(CLOUDFRONT_EXPIRES_HEADER_KEY);
323  string aws_expires_str = query_parameter_value(AMS_EXPIRES_HEADER_KEY);
324 
325  if(!cf_expires.empty()){ // CloudFront expires header?
326  std::istringstream(cf_expires) >> expires_time;
327  BESDEBUG(MODULE, prolog << "Using "<< CLOUDFRONT_EXPIRES_HEADER_KEY << ": " << expires_time << endl);
328  }
329  else if(!aws_expires_str.empty()){
330 
331  long long aws_expires;
332  std::istringstream(aws_expires_str) >> aws_expires;
333  // AWS Expires header?
334  //
335  // By default we'll use the time we made the URL object, ingest_time
336  std::time_t aws_start_time = ingest_time();
337 
338  // But if there's an AWS Date we'll parse that and compute the time
339  // @TODO move to NgapApi::decompose_url() and add the result to the map
340  string aws_date = query_parameter_value(AWS_DATE_HEADER_KEY);
341 
342  if(!aws_date.empty()){
343 
344  string date = aws_date; // 20200624T175046Z
345  string year = date.substr(0,4);
346  string month = date.substr(4,2);
347  string day = date.substr(6,2);
348  string hour = date.substr(9,2);
349  string minute = date.substr(11,2);
350  string second = date.substr(13,2);
351 
352  BESDEBUG(MODULE, prolog << "date: "<< date <<
353  " year: " << year << " month: " << month << " day: " << day <<
354  " hour: " << hour << " minute: " << minute << " second: " << second << endl);
355 
356  std::time_t old_now;
357  time(&old_now); /* get current time; same as: timer = time(NULL) */
358  BESDEBUG(MODULE, prolog << "old_now: " << old_now << endl);
359  struct tm *ti = gmtime(&old_now);
360  ti->tm_year = stoll(year) - 1900;
361  ti->tm_mon = stoll(month) - 1;
362  ti->tm_mday = stoll(day);
363  ti->tm_hour = stoll(hour);
364  ti->tm_min = stoll(minute);
365  ti->tm_sec = stoll(second);
366 
367  BESDEBUG(MODULE, prolog << "ti->tm_year: "<< ti->tm_year <<
368  " ti->tm_mon: " << ti->tm_mon <<
369  " ti->tm_mday: " << ti->tm_mday <<
370  " ti->tm_hour: " << ti->tm_hour <<
371  " ti->tm_min: " << ti->tm_min <<
372  " ti->tm_sec: " << ti->tm_sec << endl);
373 
374 
375  aws_start_time = mktime(ti);
376  BESDEBUG(MODULE, prolog << "AWS start_time (computed): " << aws_start_time << endl);
377  }
378 
379  expires_time = aws_start_time + aws_expires;
380  BESDEBUG(MODULE, prolog << "Using "<< AMS_EXPIRES_HEADER_KEY << ": " << aws_expires <<
381  " (expires_time: " << expires_time << ")" << endl);
382  }
383  std::time_t remaining = expires_time - now;
384  BESDEBUG(MODULE, prolog << "expires_time: " << expires_time <<
385  " remaining: " << remaining <<
386  " threshold: " << HTTP_URL_REFRESH_THRESHOLD << endl);
387 
388  stale = remaining < HTTP_URL_REFRESH_THRESHOLD;
389  BESDEBUG(MODULE, prolog << "stale: " << (stale?"true":"false") << endl);
390 
391  return stale;
392 }
393 
398 string url::dump(){
399  stringstream ss;
400  string indent_inc = " ";
401  string indent = indent_inc;
402 
403  ss << "http::url [" << this << "] " << endl;
404  ss << indent << "d_source_url_str: " << d_source_url_str << endl;
405  ss << indent << "d_protocol: " << d_protocol << endl;
406  ss << indent << "d_host: " << d_host << endl;
407  ss << indent << "d_path: " << d_path << endl;
408  ss << indent << "d_query: " << d_query << endl;
409 
410  std::map<std::string, std::vector<std::string>* >::iterator it;
411 
412  string idt = indent+indent_inc;
413  for(it=d_query_kvp.begin(); it !=d_query_kvp.end(); it++){
414  ss << indent << "d_query_kvp["<<it->first<<"]: " << endl;
415  std::vector<std::string> *values = it->second;
416  for(size_t i=0; i<values->size(); i++){
417  ss << idt << "value[" << i << "]: " << (*values)[i] << endl;
418  }
419  }
420  ss << indent << "d_ingest_time: " << d_ingest_time.time_since_epoch().count() << endl;
421  return ss.str();
422 }
423 
424 } // namespace http
List of all registered catalogs.
virtual std::string default_catalog_name() const
The name of the default catalog.
static BESCatalogList * TheCatalogList()
Get the singleton BESCatalogList instance.
Catalogs provide a hierarchical organization for data.
Definition: BESCatalog.h:51
virtual std::string get_root() const =0
virtual std::string get_catalog_name() const
Get the name for this catalog.
Definition: BESCatalog.h:103
exception thrown if internal error encountered
static void tokenize(const std::string &str, std::vector< std::string > &tokens, const std::string &delimiters="/")
Definition: BESUtil.cc:1086
static std::string pathConcat(const std::string &firstPart, const std::string &secondPart, char separator='/')
Concatenate path fragments making sure that they are separated by a single '/' character.
Definition: BESUtil.cc:791
utility class for the HTTP catalog module
Definition: AllowedHosts.cc:55