Reject concurrent transactions (#9597)
If more transactions arrive from an origin while we're still processing the first one, reject them. Hopefully this is a quick fix for https://github.com/matrix-org/synapse/issues/9489
This commit is contained in:
parent
2b328d7e02
commit
1e67bff833
|
@ -0,0 +1 @@
|
||||||
|
Fix a bug introduced in Synapse 1.20 which caused incoming federation transactions to stack up, leading to slow recovery from outages.
|
|
@ -112,10 +112,11 @@ class FederationServer(FederationBase):
|
||||||
# with FederationHandlerRegistry.
|
# with FederationHandlerRegistry.
|
||||||
hs.get_directory_handler()
|
hs.get_directory_handler()
|
||||||
|
|
||||||
self._federation_ratelimiter = hs.get_federation_ratelimiter()
|
|
||||||
|
|
||||||
self._server_linearizer = Linearizer("fed_server")
|
self._server_linearizer = Linearizer("fed_server")
|
||||||
self._transaction_linearizer = Linearizer("fed_txn_handler")
|
|
||||||
|
# origins that we are currently processing a transaction from.
|
||||||
|
# a dict from origin to txn id.
|
||||||
|
self._active_transactions = {} # type: Dict[str, str]
|
||||||
|
|
||||||
# We cache results for transaction with the same ID
|
# We cache results for transaction with the same ID
|
||||||
self._transaction_resp_cache = ResponseCache(
|
self._transaction_resp_cache = ResponseCache(
|
||||||
|
@ -169,6 +170,33 @@ class FederationServer(FederationBase):
|
||||||
|
|
||||||
logger.debug("[%s] Got transaction", transaction_id)
|
logger.debug("[%s] Got transaction", transaction_id)
|
||||||
|
|
||||||
|
# Reject malformed transactions early: reject if too many PDUs/EDUs
|
||||||
|
if len(transaction.pdus) > 50 or ( # type: ignore
|
||||||
|
hasattr(transaction, "edus") and len(transaction.edus) > 100 # type: ignore
|
||||||
|
):
|
||||||
|
logger.info("Transaction PDU or EDU count too large. Returning 400")
|
||||||
|
return 400, {}
|
||||||
|
|
||||||
|
# we only process one transaction from each origin at a time. We need to do
|
||||||
|
# this check here, rather than in _on_incoming_transaction_inner so that we
|
||||||
|
# don't cache the rejection in _transaction_resp_cache (so that if the txn
|
||||||
|
# arrives again later, we can process it).
|
||||||
|
current_transaction = self._active_transactions.get(origin)
|
||||||
|
if current_transaction and current_transaction != transaction_id:
|
||||||
|
logger.warning(
|
||||||
|
"Received another txn %s from %s while still processing %s",
|
||||||
|
transaction_id,
|
||||||
|
origin,
|
||||||
|
current_transaction,
|
||||||
|
)
|
||||||
|
return 429, {
|
||||||
|
"errcode": Codes.UNKNOWN,
|
||||||
|
"error": "Too many concurrent transactions",
|
||||||
|
}
|
||||||
|
|
||||||
|
# CRITICAL SECTION: we must now not await until we populate _active_transactions
|
||||||
|
# in _on_incoming_transaction_inner.
|
||||||
|
|
||||||
# We wrap in a ResponseCache so that we de-duplicate retried
|
# We wrap in a ResponseCache so that we de-duplicate retried
|
||||||
# transactions.
|
# transactions.
|
||||||
return await self._transaction_resp_cache.wrap(
|
return await self._transaction_resp_cache.wrap(
|
||||||
|
@ -182,26 +210,18 @@ class FederationServer(FederationBase):
|
||||||
async def _on_incoming_transaction_inner(
|
async def _on_incoming_transaction_inner(
|
||||||
self, origin: str, transaction: Transaction, request_time: int
|
self, origin: str, transaction: Transaction, request_time: int
|
||||||
) -> Tuple[int, Dict[str, Any]]:
|
) -> Tuple[int, Dict[str, Any]]:
|
||||||
# Use a linearizer to ensure that transactions from a remote are
|
# CRITICAL SECTION: the first thing we must do (before awaiting) is
|
||||||
# processed in order.
|
# add an entry to _active_transactions.
|
||||||
with await self._transaction_linearizer.queue(origin):
|
assert origin not in self._active_transactions
|
||||||
# We rate limit here *after* we've queued up the incoming requests,
|
self._active_transactions[origin] = transaction.transaction_id # type: ignore
|
||||||
# so that we don't fill up the ratelimiter with blocked requests.
|
|
||||||
#
|
|
||||||
# This is important as the ratelimiter allows N concurrent requests
|
|
||||||
# at a time, and only starts ratelimiting if there are more requests
|
|
||||||
# than that being processed at a time. If we queued up requests in
|
|
||||||
# the linearizer/response cache *after* the ratelimiting then those
|
|
||||||
# queued up requests would count as part of the allowed limit of N
|
|
||||||
# concurrent requests.
|
|
||||||
with self._federation_ratelimiter.ratelimit(origin) as d:
|
|
||||||
await d
|
|
||||||
|
|
||||||
|
try:
|
||||||
result = await self._handle_incoming_transaction(
|
result = await self._handle_incoming_transaction(
|
||||||
origin, transaction, request_time
|
origin, transaction, request_time
|
||||||
)
|
)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
finally:
|
||||||
|
del self._active_transactions[origin]
|
||||||
|
|
||||||
async def _handle_incoming_transaction(
|
async def _handle_incoming_transaction(
|
||||||
self, origin: str, transaction: Transaction, request_time: int
|
self, origin: str, transaction: Transaction, request_time: int
|
||||||
|
@ -227,19 +247,6 @@ class FederationServer(FederationBase):
|
||||||
|
|
||||||
logger.debug("[%s] Transaction is new", transaction.transaction_id) # type: ignore
|
logger.debug("[%s] Transaction is new", transaction.transaction_id) # type: ignore
|
||||||
|
|
||||||
# Reject if PDU count > 50 or EDU count > 100
|
|
||||||
if len(transaction.pdus) > 50 or ( # type: ignore
|
|
||||||
hasattr(transaction, "edus") and len(transaction.edus) > 100 # type: ignore
|
|
||||||
):
|
|
||||||
|
|
||||||
logger.info("Transaction PDU or EDU count too large. Returning 400")
|
|
||||||
|
|
||||||
response = {}
|
|
||||||
await self.transaction_actions.set_response(
|
|
||||||
origin, transaction, 400, response
|
|
||||||
)
|
|
||||||
return 400, response
|
|
||||||
|
|
||||||
# We process PDUs and EDUs in parallel. This is important as we don't
|
# We process PDUs and EDUs in parallel. This is important as we don't
|
||||||
# want to block things like to device messages from reaching clients
|
# want to block things like to device messages from reaching clients
|
||||||
# behind the potentially expensive handling of PDUs.
|
# behind the potentially expensive handling of PDUs.
|
||||||
|
|
Loading…
Reference in New Issue