http_client: rewrite header parsing manually for speed

boost::regex is stupendously atrocious at parsing malformed data
This commit is contained in:
moneromooo-monero 2017-12-12 13:44:11 +00:00
parent ec724eb64a
commit bd1f6029a3
No known key found for this signature in database
GPG Key ID: 686F07454D6CEFC3
1 changed file with 97 additions and 76 deletions

View File

@ -27,6 +27,7 @@
#pragma once #pragma once
#include <ctype.h>
#include <boost/shared_ptr.hpp> #include <boost/shared_ptr.hpp>
#include <boost/regex.hpp> #include <boost/regex.hpp>
#include <boost/lexical_cast.hpp> #include <boost/lexical_cast.hpp>
@ -752,87 +753,107 @@ namespace net_utils
return true; return true;
} }
//--------------------------------------------------------------------------- //---------------------------------------------------------------------------
// Parses the header fields contained in m_cache_to_process and stores them
// in body_info.
//
// Each line is expected to look like "Name: value\r\n" (a bare "\n" line
// ending and a single space before the colon are tolerated). Parsing stops
// at the empty line ("\r\n") that terminates the header section.
//
// Well-known fields (Connection, Referer, Content-Length, Content-Type,
// Transfer-Encoding, Content-Encoding, Host, Cookie, User-Agent, Origin) are
// matched case-insensitively and written to their dedicated members; every
// other named field is appended to body_info.m_etc_fields as (name, value).
// Values have leading and trailing blanks removed.
//
// Returns true on every path: a malformed line stops parsing early through
// CHECK_AND_ASSERT_MES (which is given `true` as its failure return value),
// keeping whatever fields were parsed up to that point.
inline bool parse_header(http_header_info& body_info, const std::string& m_cache_to_process)
{
  MTRACE("http_stream_filter::parse_cached_header(*)");

  const char *ptr = m_cache_to_process.c_str();
  // One past the last character; *cache_end is the terminating NUL of c_str().
  const char *const cache_end = ptr + m_cache_to_process.size();

  // ptr[1] is only evaluated when ptr[0] == '\r', i.e. when ptr < cache_end,
  // so the lookahead never goes beyond the terminating NUL.
  while (ptr[0] != '\r' || ptr[1] != '\n')
  {
    // optional \n
    if (*ptr == '\n')
      ++ptr;

    // field name: letters, digits, '_' or '-'
    // (the <ctype.h> functions require a value representable as unsigned char)
    const char *key_pos = ptr;
    while (isalnum(static_cast<unsigned char>(*ptr)) || *ptr == '_' || *ptr == '-')
      ++ptr;
    const char *key_end = ptr;

    // optional space (not in RFC, but in previous code)
    if (*ptr == ' ')
      ++ptr;
    CHECK_AND_ASSERT_MES(*ptr == ':', true, "http_stream_filter::parse_cached_header() invalid header in: " << m_cache_to_process);
    ++ptr;

    // optional whitespace, but not newlines - line folding is obsolete, let's ignore it
    while (isblank(static_cast<unsigned char>(*ptr)))
      ++ptr;

    // the value runs up to the end of the line; never scan past the buffer
    // when the last line is not newline-terminated
    const char *value_pos = ptr;
    while (ptr != cache_end && *ptr != '\r' && *ptr != '\n')
      ++ptr;
    const char *value_end = ptr;

    // optional trailing whitespace
    while (value_end > value_pos && isblank(static_cast<unsigned char>(*(value_end - 1))))
      --value_end;

    if (*ptr == '\r')
      ++ptr;
    // also fails (and returns) when the buffer ended without a newline
    CHECK_AND_ASSERT_MES(*ptr == '\n', true, "http_stream_filter::parse_cached_header() invalid header in: " << m_cache_to_process);
    ++ptr;

    const std::string key(key_pos, key_end);
    const std::string value(value_pos, value_end);
    if (!key.empty())
    {
      // compare_no_case is used as "0 means equal", hence the negation
      if (!string_tools::compare_no_case(key, "Connection"))
        body_info.m_connection = value;
      else if (!string_tools::compare_no_case(key, "Referer")) // HTTP spells it with a single 'r'
        body_info.m_referer = value;
      else if (!string_tools::compare_no_case(key, "Content-Length"))
        body_info.m_content_length = value;
      else if (!string_tools::compare_no_case(key, "Content-Type"))
        body_info.m_content_type = value;
      else if (!string_tools::compare_no_case(key, "Transfer-Encoding"))
        body_info.m_transfer_encoding = value;
      else if (!string_tools::compare_no_case(key, "Content-Encoding"))
        body_info.m_content_encoding = value;
      else if (!string_tools::compare_no_case(key, "Host"))
        body_info.m_host = value;
      else if (!string_tools::compare_no_case(key, "Cookie"))
        body_info.m_cookie = value;
      else if (!string_tools::compare_no_case(key, "User-Agent"))
        body_info.m_user_agent = value;
      else if (!string_tools::compare_no_case(key, "Origin"))
        body_info.m_origin = value;
      else
        body_info.m_etc_fields.emplace_back(key, value);
    }
  }
  return true;
}
inline //---------------------------------------------------------------------------
bool analize_first_response_line() inline bool analize_first_response_line()
{ {
//First line response, look like this: "HTTP/1.1 200 OK" //First line response, look like this: "HTTP/1.1 200 OK"
STATIC_REGEXP_EXPR_1(rexp_match_first_response_line, "^HTTP/(\\d+).(\\d+) ((\\d)\\d{2})( [^\n]*)?\r?\n", boost::regex::icase | boost::regex::normal); const char *ptr = m_header_cache.c_str();
// 1 2 34 5 CHECK_AND_ASSERT_MES(!memcmp(ptr, "HTTP/", 5), false, "Invalid first response line: " + m_header_cache);
//size_t match_len = 0; ptr += 5;
boost::smatch result; CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache);
if(boost::regex_search( m_header_cache, result, rexp_match_first_response_line, boost::match_default) && result[0].matched) unsigned long ul;
{ char *end;
CHECK_AND_ASSERT_MES(result[1].matched&&result[2].matched, false, "http_stream_filter::handle_invoke_reply_line() assert failed..."); ul = strtoul(ptr, &end, 10);
m_response_info.m_http_ver_hi = boost::lexical_cast<int>(result[1]); CHECK_AND_ASSERT_MES(ul <= INT_MAX && *end =='.', false, "Invalid first response line: " + m_header_cache);
m_response_info.m_http_ver_lo = boost::lexical_cast<int>(result[2]); m_response_info.m_http_ver_hi = ul;
m_response_info.m_response_code = boost::lexical_cast<int>(result[3]); ptr = end + 1;
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache + ", ptr: " << ptr);
ul = strtoul(ptr, &end, 10);
CHECK_AND_ASSERT_MES(ul <= INT_MAX && isblank(*end), false, "Invalid first response line: " + m_header_cache + ", ptr: " << ptr);
m_response_info.m_http_ver_lo = ul;
ptr = end + 1;
while (isblank(*ptr))
++ptr;
CHECK_AND_ASSERT_MES(isdigit(*ptr), false, "Invalid first response line: " + m_header_cache);
ul = strtoul(ptr, &end, 10);
CHECK_AND_ASSERT_MES(ul >= 100 && ul <= 999 && isspace(*end), false, "Invalid first response line: " + m_header_cache);
m_response_info.m_response_code = ul;
ptr = end;
// ignore the optional text, till the end
while (*ptr != '\r' && *ptr != '\n')
++ptr;
if (*ptr == '\r')
++ptr;
CHECK_AND_ASSERT_MES(*ptr == '\n', false, "Invalid first response line: " << m_header_cache);
++ptr;
m_header_cache.erase(to_nonsonst_iterator(m_header_cache, result[0].first), to_nonsonst_iterator(m_header_cache, result[0].second)); m_header_cache.erase(0, ptr - m_header_cache.c_str());
return true; return true;
}else
{
LOG_ERROR("http_stream_filter::handle_invoke_reply_line(): Failed to match first response line:" << m_header_cache);
return false;
}
} }
inline inline
bool set_reply_content_encoder() bool set_reply_content_encoder()