diff --git a/changelog.d/4358.misc b/changelog.d/4358.misc new file mode 100644 index 0000000000..020dacb547 --- /dev/null +++ b/changelog.d/4358.misc @@ -0,0 +1 @@ +Add better logging for unexpected errors while sending transactions diff --git a/synapse/api/errors.py b/synapse/api/errors.py index 48b903374d..0b464834ce 100644 --- a/synapse/api/errors.py +++ b/synapse/api/errors.py @@ -348,6 +348,24 @@ class IncompatibleRoomVersionError(SynapseError): ) +class RequestSendFailed(RuntimeError): + """Sending a HTTP request over federation failed due to not being able to + talk to the remote server for some reason. + + This exception is used to differentiate "expected" errors that arise due to + networking (e.g. DNS failures, connection timeouts etc), versus unexpected + errors (like programming errors). + """ + def __init__(self, inner_exception, can_retry): + super(RequestSendFailed, self).__init__( + "Failed to send request: %s: %s" % ( + type(inner_exception).__name__, inner_exception, + ) + ) + self.inner_exception = inner_exception + self.can_retry = can_retry + + def cs_error(msg, code=Codes.UNKNOWN, **kwargs): """ Utility method for constructing an error response for client-server interactions. diff --git a/synapse/federation/transaction_queue.py b/synapse/federation/transaction_queue.py index 099ace28c1..4640513497 100644 --- a/synapse/federation/transaction_queue.py +++ b/synapse/federation/transaction_queue.py @@ -22,7 +22,11 @@ from prometheus_client import Counter from twisted.internet import defer import synapse.metrics -from synapse.api.errors import FederationDeniedError, HttpResponseException +from synapse.api.errors import ( + FederationDeniedError, + HttpResponseException, + RequestSendFailed, +) from synapse.handlers.presence import format_user_presence_state, get_interested_remotes from synapse.metrics import ( LaterGauge, @@ -518,11 +522,16 @@ class TransactionQueue(object): ) except FederationDeniedError as e: logger.info(e) - except Exception as e: - logger.warn( - "TX [%s] Failed to send transaction: %s", + except RequestSendFailed as e: + logger.warning("(TX [%s] Failed to send transaction: %s", destination, e) + + for p, _ in pending_pdus: + logger.info("Failed to send event %s to %s", p.event_id, + destination) + except Exception: + logger.exception( + "TX [%s] Failed to send transaction", destination, - e, ) for p, _ in pending_pdus: logger.info("Failed to send event %s to %s", p.event_id, diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index 24b6110c20..7a2b4f0957 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -19,7 +19,7 @@ import random import sys from io import BytesIO -from six import PY3, string_types +from six import PY3, raise_from, string_types from six.moves import urllib import attr @@ -41,6 +41,7 @@ from synapse.api.errors import ( Codes, FederationDeniedError, HttpResponseException, + RequestSendFailed, SynapseError, ) from synapse.http.endpoint import matrix_federation_endpoint @@ -231,7 +232,7 @@ class MatrixFederationHttpClient(object): Deferred: resolves with the http response object on success. Fails with ``HttpResponseException``: if we get an HTTP response - code >= 300. + code >= 300 (except 429). Fails with ``NotRetryingDestination`` if we are not yet ready to retry this server. @@ -239,8 +240,8 @@ class MatrixFederationHttpClient(object): Fails with ``FederationDeniedError`` if this destination is not on our federation whitelist - (May also fail with plenty of other Exceptions for things like DNS - failures, connection failures, SSL failures.) + Fails with ``RequestSendFailed`` if there were problems connecting to + the remote, due to e.g. DNS failures, connection timeouts etc. """ if timeout: _sec_timeout = timeout / 1000 @@ -335,23 +336,74 @@ class MatrixFederationHttpClient(object): reactor=self.hs.get_reactor(), ) - with Measure(self.clock, "outbound_request"): - response = yield make_deferred_yieldable( - request_deferred, + try: + with Measure(self.clock, "outbound_request"): + response = yield make_deferred_yieldable( + request_deferred, + ) + except DNSLookupError as e: + raise_from(RequestSendFailed(e, can_retry=retry_on_dns_fail), e) + except Exception as e: + raise_from(RequestSendFailed(e, can_retry=True), e) + + logger.info( + "{%s} [%s] Got response headers: %d %s", + request.txn_id, + request.destination, + response.code, + response.phrase.decode('ascii', errors='replace'), + ) + + if 200 <= response.code < 300: + pass + else: + # :'( + # Update transactions table? + d = treq.content(response) + d = timeout_deferred( + d, + timeout=_sec_timeout, + reactor=self.hs.get_reactor(), ) + try: + body = yield make_deferred_yieldable(d) + except Exception as e: + # Eh, we're already going to raise an exception so lets + # ignore if this fails. + logger.warn( + "{%s} [%s] Failed to get error response: %s %s: %s", + request.txn_id, + request.destination, + request.method, + url_str, + _flatten_response_never_received(e), + ) + body = None + + e = HttpResponseException( + response.code, response.phrase, body + ) + + # Retry if the error is a 429 (Too Many Requests), + # otherwise just raise a standard HttpResponseException + if response.code == 429: + raise_from(RequestSendFailed(e, can_retry=True), e) + else: + raise e + break - except Exception as e: + except RequestSendFailed as e: logger.warn( "{%s} [%s] Request failed: %s %s: %s", request.txn_id, request.destination, request.method, url_str, - _flatten_response_never_received(e), + _flatten_response_never_received(e.inner_exception), ) - if not retry_on_dns_fail and isinstance(e, DNSLookupError): + if not e.can_retry: raise if retries_left and not timeout: @@ -376,29 +428,16 @@ class MatrixFederationHttpClient(object): else: raise - logger.info( - "{%s} [%s] Got response headers: %d %s", - request.txn_id, - request.destination, - response.code, - response.phrase.decode('ascii', errors='replace'), - ) - - if 200 <= response.code < 300: - pass - else: - # :'( - # Update transactions table? - d = treq.content(response) - d = timeout_deferred( - d, - timeout=_sec_timeout, - reactor=self.hs.get_reactor(), - ) - body = yield make_deferred_yieldable(d) - raise HttpResponseException( - response.code, response.phrase, body - ) + except Exception as e: + logger.warn( + "{%s} [%s] Request failed: %s %s: %s", + request.txn_id, + request.destination, + request.method, + url_str, + _flatten_response_never_received(e), + ) + raise defer.returnValue(response) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index e117836e9a..bdffa97805 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -30,6 +30,7 @@ from synapse.api.errors import ( FederationDeniedError, HttpResponseException, NotFoundError, + RequestSendFailed, SynapseError, ) from synapse.metrics.background_process_metrics import run_as_background_process @@ -372,10 +373,10 @@ class MediaRepository(object): "allow_remote": "false", } ) - except twisted.internet.error.DNSLookupError as e: - logger.warn("HTTP error fetching remote media %s/%s: %r", + except RequestSendFailed as e: + logger.warn("Request failed fetching remote media %s/%s: %r", server_name, media_id, e) - raise NotFoundError() + raise SynapseError(502, "Failed to fetch remote media") except HttpResponseException as e: logger.warn("HTTP error fetching remote media %s/%s: %s", diff --git a/tests/http/test_fedclient.py b/tests/http/test_fedclient.py index f3cb1423f0..b2e38276d8 100644 --- a/tests/http/test_fedclient.py +++ b/tests/http/test_fedclient.py @@ -20,6 +20,7 @@ from twisted.internet.error import ConnectingCancelledError, DNSLookupError from twisted.web.client import ResponseNeverReceived from twisted.web.http import HTTPChannel +from synapse.api.errors import RequestSendFailed from synapse.http.matrixfederationclient import ( MatrixFederationHttpClient, MatrixFederationRequest, @@ -49,7 +50,8 @@ class FederationClientTests(HomeserverTestCase): self.pump() f = self.failureResultOf(d) - self.assertIsInstance(f.value, DNSLookupError) + self.assertIsInstance(f.value, RequestSendFailed) + self.assertIsInstance(f.value.inner_exception, DNSLookupError) def test_client_never_connect(self): """ @@ -76,7 +78,11 @@ class FederationClientTests(HomeserverTestCase): self.reactor.advance(10.5) f = self.failureResultOf(d) - self.assertIsInstance(f.value, (ConnectingCancelledError, TimeoutError)) + self.assertIsInstance(f.value, RequestSendFailed) + self.assertIsInstance( + f.value.inner_exception, + (ConnectingCancelledError, TimeoutError), + ) def test_client_connect_no_response(self): """ @@ -107,7 +113,8 @@ class FederationClientTests(HomeserverTestCase): self.reactor.advance(10.5) f = self.failureResultOf(d) - self.assertIsInstance(f.value, ResponseNeverReceived) + self.assertIsInstance(f.value, RequestSendFailed) + self.assertIsInstance(f.value.inner_exception, ResponseNeverReceived) def test_client_gets_headers(self): """