http_client: rewrite header parsing manually for speed
boost::regex is stupendously atrocious at parsing malformed data
This commit is contained in:
parent
ec724eb64a
commit
bd1f6029a3
|
@ -27,6 +27,7 @@
|
|||
|
||||
|
||||
#pragma once
|
||||
#include <ctype.h>
|
||||
#include <boost/shared_ptr.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
@ -752,87 +753,107 @@ namespace net_utils
|
|||
return true;
|
||||
}
|
||||
//---------------------------------------------------------------------------
|
||||
inline
|
||||
bool parse_header(http_header_info& body_info, const std::string& m_cache_to_process)
|
||||
{
|
||||
MTRACE("http_stream_filter::parse_cached_header(*)");
|
||||
|
||||
STATIC_REGEXP_EXPR_1(rexp_mach_field,
|
||||
"\n?((Connection)|(Referer)|(Content-Length)|(Content-Type)|(Transfer-Encoding)|(Content-Encoding)|(Host)|(Cookie)|(User-Agent)|(Origin)"
|
||||
// 12 3 4 5 6 7 8 9 10 11
|
||||
"|([\\w-]+?)) ?: ?((.*?)(\r?\n))[^\t ]",
|
||||
//12 13 14 15
|
||||
boost::regex::icase | boost::regex::normal);
|
||||
|
||||
boost::smatch result;
|
||||
std::string::const_iterator it_current_bound = m_cache_to_process.begin();
|
||||
std::string::const_iterator it_end_bound = m_cache_to_process.end();
|
||||
|
||||
|
||||
|
||||
//lookup all fields and fill well-known fields
|
||||
while( boost::regex_search( it_current_bound, it_end_bound, result, rexp_mach_field, boost::match_default) && result[0].matched)
|
||||
{
|
||||
const size_t field_val = 14;
|
||||
//const size_t field_etc_name = 11;
|
||||
|
||||
int i = 2; //start position = 2
|
||||
if(result[i++].matched)//"Connection"
|
||||
body_info.m_connection = result[field_val];
|
||||
else if(result[i++].matched)//"Referrer"
|
||||
body_info.m_referer = result[field_val];
|
||||
else if(result[i++].matched)//"Content-Length"
|
||||
body_info.m_content_length = result[field_val];
|
||||
else if(result[i++].matched)//"Content-Type"
|
||||
body_info.m_content_type = result[field_val];
|
||||
else if(result[i++].matched)//"Transfer-Encoding"
|
||||
body_info.m_transfer_encoding = result[field_val];
|
||||
else if(result[i++].matched)//"Content-Encoding"
|
||||
body_info.m_content_encoding = result[field_val];
|
||||
else if(result[i++].matched)//"Host"
|
||||
{ body_info.m_host = result[field_val];
|
||||
string_tools::trim(body_info.m_host);
|
||||
}
|
||||
else if(result[i++].matched)//"Cookie"
|
||||
body_info.m_cookie = result[field_val];
|
||||
else if(result[i++].matched)//"User-Agent"
|
||||
body_info.m_user_agent = result[field_val];
|
||||
else if(result[i++].matched)//"Origin"
|
||||
body_info.m_origin = result[field_val];
|
||||
else if(result[i++].matched)//e.t.c (HAVE TO BE MATCHED!)
|
||||
body_info.m_etc_fields.emplace_back(result[12], result[field_val]);
|
||||
else
|
||||
{CHECK_AND_ASSERT_MES(false, false, "http_stream_filter::parse_cached_header() not matched last entry in:"<<m_cache_to_process);}
|
||||
|
||||
it_current_bound = result[(int)result.size()-1]. first;
|
||||
}
|
||||
return true;
|
||||
|
||||
}
|
||||
inline
|
||||
bool analize_first_response_line()
|
||||
inline bool parse_header(http_header_info& body_info, const std::string& m_cache_to_process)
|
||||
{
|
||||
MTRACE("http_stream_filter::parse_cached_header(*)");
|
||||
|
||||
//First line response, look like this: "HTTP/1.1 200 OK"
|
||||
STATIC_REGEXP_EXPR_1(rexp_match_first_response_line, "^HTTP/(\\d+).(\\d+) ((\\d)\\d{2})( [^\n]*)?\r?\n", boost::regex::icase | boost::regex::normal);
|
||||
// 1 2 34 5
|
||||
//size_t match_len = 0;
|
||||
boost::smatch result;
|
||||
if(boost::regex_search( m_header_cache, result, rexp_match_first_response_line, boost::match_default) && result[0].matched)
|
||||
const char *ptr = m_cache_to_process.c_str();
|
||||
while (ptr[0] != '\r' || ptr[1] != '\n')
|
||||
{
|
||||
CHECK_AND_ASSERT_MES(result[1].matched&&result[2].matched, false, "http_stream_filter::handle_invoke_reply_line() assert failed...");
|
||||
m_response_info.m_http_ver_hi = boost::lexical_cast<int>(result[1]);
|
||||
m_response_info.m_http_ver_lo = boost::lexical_cast<int>(result[2]);
|
||||
m_response_info.m_response_code = boost::lexical_cast<int>(result[3]);
|
||||
|
||||
m_header_cache.erase(to_nonsonst_iterator(m_header_cache, result[0].first), to_nonsonst_iterator(m_header_cache, result[0].second));
|
||||
return true;
|
||||
}else
|
||||
{
|
||||
LOG_ERROR("http_stream_filter::handle_invoke_reply_line(): Failed to match first response line:" << m_header_cache);
|
||||
return false;
|
||||
// optional \n
|
||||
if (*ptr == '\n')
|
||||
++ptr;
|
||||
// an identifier composed of letters or -
|
||||
const char *key_pos = ptr;
|
||||
while (isalnum(*ptr) || *ptr == '_' || *ptr == '-')
|
||||
++ptr;
|
||||
const char *key_end = ptr;
|
||||
// optional space (not in RFC, but in previous code)
|
||||
if (*ptr == ' ')
|
||||
++ptr;
|
||||
CHECK_AND_ASSERT_MES(*ptr == ':', true, "http_stream_filter::parse_cached_header() invalid header in: " << m_cache_to_process);
|
||||
++ptr;
|
||||
// optional whitespace, but not newlines - line folding is obsolete, let's ignore it
|
||||
while (isblank(*ptr))
|
||||
++ptr;
|
||||
const char *value_pos = ptr;
|
||||
while (*ptr != '\r' && *ptr != '\n')
|
||||
++ptr;
|
||||
const char *value_end = ptr;
|
||||
// optional trailing whitespace
|
||||
while (value_end > value_pos && isblank(*(value_end-1)))
|
||||
--value_end;
|
||||
if (*ptr == '\r')
|
||||
++ptr;
|
||||
CHECK_AND_ASSERT_MES(*ptr == '\n', true, "http_stream_filter::parse_cached_header() invalid header in: " << m_cache_to_process);
|
||||
++ptr;
|
||||
|
||||
const std::string key = std::string(key_pos, key_end - key_pos);
|
||||
const std::string value = std::string(value_pos, value_end - value_pos);
|
||||
if (!key.empty())
|
||||
{
|
||||
if (!string_tools::compare_no_case(key, "Connection"))
|
||||
body_info.m_connection = value;
|
||||
else if(!string_tools::compare_no_case(key, "Referrer"))
|
||||
body_info.m_referer = value;
|
||||
else if(!string_tools::compare_no_case(key, "Content-Length"))
|
||||
body_info.m_content_length = value;
|
||||
else if(!string_tools::compare_no_case(key, "Content-Type"))
|
||||
body_info.m_content_type = value;
|
||||
else if(!string_tools::compare_no_case(key, "Transfer-Encoding"))
|
||||
body_info.m_transfer_encoding = value;
|
||||
else if(!string_tools::compare_no_case(key, "Content-Encoding"))
|
||||
body_info.m_content_encoding = value;
|
||||
else if(!string_tools::compare_no_case(key, "Host"))
|
||||
body_info.m_host = value;
|
||||
else if(!string_tools::compare_no_case(key, "Cookie"))
|
||||
body_info.m_cookie = value;
|
||||
else if(!string_tools::compare_no_case(key, "User-Agent"))
|
||||
body_info.m_user_agent = value;
|
||||
else if(!string_tools::compare_no_case(key, "Origin"))
|
||||
body_info.m_origin = value;
|
||||
else
|
||||
body_info.m_etc_fields.emplace_back(key, value);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
//---------------------------------------------------------------------------
|
||||
inline bool analize_first_response_line()
|
||||
{
|
||||
//First line response, look like this: "HTTP/1.1 200 OK"
|
||||
const char *ptr = m_header_cache.c_str();
|
||||
CHECK_AND_ASSERT_MES(!memcmp(ptr, "HTTP/", 5), false, "Invalid first response line: " + m_header_cache);
|
||||
ptr += 5;
|
||||
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache);
|
||||
unsigned long ul;
|
||||
char *end;
|
||||
ul = strtoul(ptr, &end, 10);
|
||||
CHECK_AND_ASSERT_MES(ul <= INT_MAX && *end =='.', false, "Invalid first response line: " + m_header_cache);
|
||||
m_response_info.m_http_ver_hi = ul;
|
||||
ptr = end + 1;
|
||||
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache + ", ptr: " << ptr);
|
||||
ul = strtoul(ptr, &end, 10);
|
||||
CHECK_AND_ASSERT_MES(ul <= INT_MAX && isblank(*end), false, "Invalid first response line: " + m_header_cache + ", ptr: " << ptr);
|
||||
m_response_info.m_http_ver_lo = ul;
|
||||
ptr = end + 1;
|
||||
while (isblank(*ptr))
|
||||
++ptr;
|
||||
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache);
|
||||
ul = strtoul(ptr, &end, 10);
|
||||
CHECK_AND_ASSERT_MES(ul >= 100 && ul <= 999 && isspace(*end), false, "Invalid first response line: " + m_header_cache);
|
||||
m_response_info.m_response_code = ul;
|
||||
ptr = end;
|
||||
// ignore the optional text, till the end
|
||||
while (*ptr != '\r' && *ptr != '\n')
|
||||
++ptr;
|
||||
if (*ptr == '\r')
|
||||
++ptr;
|
||||
CHECK_AND_ASSERT_MES(*ptr == '\n', false, "Invalid first response line: " << m_header_cache);
|
||||
++ptr;
|
||||
|
||||
m_header_cache.erase(0, ptr - m_header_cache.c_str());
|
||||
return true;
|
||||
}
|
||||
inline
|
||||
bool set_reply_content_encoder()
|
||||
|
|
Loading…
Reference in New Issue