Merge pull request #340 from matrix-org/erikj/server_retries
Retry dead servers a lot less often
This commit is contained in:
commit 5bc690408d
synapse/http/matrixfederationclient.py

@@ -35,6 +35,7 @@ from signedjson.sign import sign_json
 import simplejson as json
 import logging
+import random
 import sys
 import urllib
 import urlparse

@@ -55,6 +56,9 @@ incoming_responses_counter = metrics.register_counter(
 )


+MAX_RETRIES = 4
+
+
 class MatrixFederationEndpointFactory(object):
     def __init__(self, hs):
         self.tls_server_context_factory = hs.tls_server_context_factory

@@ -119,7 +123,7 @@ class MatrixFederationHttpClient(object):

         # XXX: Would be much nicer to retry only at the transaction-layer
         # (once we have reliable transactions in place)
-        retries_left = 5
+        retries_left = MAX_RETRIES

         http_url_bytes = urlparse.urlunparse(
             ("", "", path_bytes, param_bytes, query_bytes, "")

@@ -180,7 +184,9 @@ class MatrixFederationHttpClient(object):
                 )

                 if retries_left and not timeout:
-                    yield sleep(2 ** (5 - retries_left))
+                    delay = 5 ** (MAX_RETRIES + 1 - retries_left)
+                    delay *= random.uniform(0.8, 1.4)
+                    yield sleep(delay)
                     retries_left -= 1
                 else:
                     raise
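Taken together, these hunks change the per-request backoff in the federation HTTP client from 1, 2, 4, 8, 16 seconds over five retries (2 ** (5 - retries_left)) to roughly 5, 25, 125, 625 seconds over four retries, each scaled by a random jitter factor between 0.8 and 1.4; the worst-case wait before giving up grows from about 31 seconds to about 13 minutes before jitter. A minimal standalone sketch of the new schedule, assuming a hypothetical do_request callable that raises IOError on failure and using blocking time.sleep in place of Twisted's deferred sleep:

    import random
    import time

    MAX_RETRIES = 4

    def request_with_backoff(do_request):
        # Retry with the exponential-backoff-plus-jitter schedule from the
        # diff above: roughly 5, 25, 125, 625 seconds between attempts.
        retries_left = MAX_RETRIES
        while True:
            try:
                return do_request()
            except IOError:
                if retries_left:
                    delay = 5 ** (MAX_RETRIES + 1 - retries_left)
                    delay *= random.uniform(0.8, 1.4)
                    time.sleep(delay)
                    retries_left -= 1
                else:
                    raise

The jitter keeps a fleet of homeservers that all lost contact with the same destination from retrying it again in lockstep.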
synapse/util/retryutils.py

@@ -18,6 +18,7 @@ from twisted.internet import defer
 from synapse.api.errors import CodeMessageException

 import logging
+import random


 logger = logging.getLogger(__name__)

@@ -85,8 +86,9 @@ def get_retry_limiter(destination, clock, store, **kwargs):

 class RetryDestinationLimiter(object):
     def __init__(self, destination, clock, store, retry_interval,
-                 min_retry_interval=5000, max_retry_interval=60 * 60 * 1000,
-                 multiplier_retry_interval=2,):
+                 min_retry_interval=10 * 60 * 1000,
+                 max_retry_interval=24 * 60 * 60 * 1000,
+                 multiplier_retry_interval=5,):
         """Marks the destination as "down" if an exception is thrown in the
         context, except for CodeMessageException with code < 500.

@@ -140,6 +142,7 @@ class RetryDestinationLimiter(object):
             # We couldn't connect.
             if self.retry_interval:
                 self.retry_interval *= self.multiplier_retry_interval
+                self.retry_interval *= int(random.uniform(0.8, 1.4))

                 if self.retry_interval >= self.max_retry_interval:
                     self.retry_interval = self.max_retry_interval
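The retryutils.py change widens the per-destination backoff: a destination that could not be reached now starts with a 10-minute retry interval instead of 5 seconds, the interval is multiplied by 5 (previously 2) on each further failure, and it is capped at 24 hours instead of 1 hour. A rough illustration of how the interval grows under the new defaults; this is not the RetryDestinationLimiter code, it assumes the interval is seeded from min_retry_interval on the first failure and leaves out the diff's jitter line:

    # Values are in milliseconds, as in retryutils.py.
    MIN_RETRY_INTERVAL = 10 * 60 * 1000         # 10 minutes
    MAX_RETRY_INTERVAL = 24 * 60 * 60 * 1000    # 24 hours
    MULTIPLIER_RETRY_INTERVAL = 5

    retry_interval = 0
    for failure in range(1, 7):
        if retry_interval:
            retry_interval = min(
                retry_interval * MULTIPLIER_RETRY_INTERVAL, MAX_RETRY_INTERVAL
            )
        else:
            retry_interval = MIN_RETRY_INTERVAL
        print("after failure %d: wait %d minutes" % (failure, retry_interval // 60000))
    # 10, 50, 250, 1250 minutes, then capped at 1440 minutes (24 hours).

Under the old defaults the same destination would have been retried after at most an hour; with these values a dead server is probed far less often, which is the point of the pull request.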