Periodically send pings to detect dead Redis connections (#9218)
This is done by creating a custom `RedisFactory` subclass that periodically pings all connections in its pool. We also ensure that the `replyTimeout` param is non-null, so that we time out while waiting for the reply to those pings (thus triggering a reconnect).
This commit is contained in:
parent
5b857b77f7
commit
a1ff1e967f
|
@ -0,0 +1 @@
|
||||||
|
Fix bug where we sometimes didn't detect that Redis connections had died, causing workers to not see new data.
|
|
@ -19,8 +19,9 @@ from typing import List, Optional, Type, Union
|
||||||
|
|
||||||
class RedisProtocol:
|
class RedisProtocol:
|
||||||
def publish(self, channel: str, message: bytes): ...
|
def publish(self, channel: str, message: bytes): ...
|
||||||
|
async def ping(self) -> None: ...
|
||||||
|
|
||||||
class SubscriberProtocol:
|
class SubscriberProtocol(RedisProtocol):
|
||||||
def __init__(self, *args, **kwargs): ...
|
def __init__(self, *args, **kwargs): ...
|
||||||
password: Optional[str]
|
password: Optional[str]
|
||||||
def subscribe(self, channels: Union[str, List[str]]): ...
|
def subscribe(self, channels: Union[str, List[str]]): ...
|
||||||
|
@ -39,14 +40,13 @@ def lazyConnection(
|
||||||
convertNumbers: bool = ...,
|
convertNumbers: bool = ...,
|
||||||
) -> RedisProtocol: ...
|
) -> RedisProtocol: ...
|
||||||
|
|
||||||
class SubscriberFactory:
|
|
||||||
def buildProtocol(self, addr): ...
|
|
||||||
|
|
||||||
class ConnectionHandler: ...
|
class ConnectionHandler: ...
|
||||||
|
|
||||||
class RedisFactory:
|
class RedisFactory:
|
||||||
continueTrying: bool
|
continueTrying: bool
|
||||||
handler: RedisProtocol
|
handler: RedisProtocol
|
||||||
|
pool: List[RedisProtocol]
|
||||||
|
replyTimeout: Optional[int]
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
uuid: str,
|
uuid: str,
|
||||||
|
@ -59,3 +59,7 @@ class RedisFactory:
|
||||||
replyTimeout: Optional[int] = None,
|
replyTimeout: Optional[int] = None,
|
||||||
convertNumbers: Optional[int] = True,
|
convertNumbers: Optional[int] = True,
|
||||||
): ...
|
): ...
|
||||||
|
def buildProtocol(self, addr) -> RedisProtocol: ...
|
||||||
|
|
||||||
|
class SubscriberFactory(RedisFactory):
|
||||||
|
def __init__(self): ...
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
import logging
|
import logging
|
||||||
from typing import (
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
Any,
|
Any,
|
||||||
Awaitable,
|
Awaitable,
|
||||||
Dict,
|
Dict,
|
||||||
|
@ -63,6 +64,9 @@ from synapse.replication.tcp.streams import (
|
||||||
TypingStream,
|
TypingStream,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from synapse.server import HomeServer
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ -88,7 +92,7 @@ class ReplicationCommandHandler:
|
||||||
back out to connections.
|
back out to connections.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, hs):
|
def __init__(self, hs: "HomeServer"):
|
||||||
self._replication_data_handler = hs.get_replication_data_handler()
|
self._replication_data_handler = hs.get_replication_data_handler()
|
||||||
self._presence_handler = hs.get_presence_handler()
|
self._presence_handler = hs.get_presence_handler()
|
||||||
self._store = hs.get_datastore()
|
self._store = hs.get_datastore()
|
||||||
|
@ -300,7 +304,7 @@ class ReplicationCommandHandler:
|
||||||
|
|
||||||
# First create the connection for sending commands.
|
# First create the connection for sending commands.
|
||||||
outbound_redis_connection = lazyConnection(
|
outbound_redis_connection = lazyConnection(
|
||||||
reactor=hs.get_reactor(),
|
hs=hs,
|
||||||
host=hs.config.redis_host,
|
host=hs.config.redis_host,
|
||||||
port=hs.config.redis_port,
|
port=hs.config.redis_port,
|
||||||
password=hs.config.redis.redis_password,
|
password=hs.config.redis.redis_password,
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from inspect import isawaitable
|
from inspect import isawaitable
|
||||||
from typing import TYPE_CHECKING, Optional
|
from typing import TYPE_CHECKING, Optional, Type, cast
|
||||||
|
|
||||||
import txredisapi
|
import txredisapi
|
||||||
|
|
||||||
|
@ -23,6 +23,7 @@ from synapse.logging.context import PreserveLoggingContext, make_deferred_yielda
|
||||||
from synapse.metrics.background_process_metrics import (
|
from synapse.metrics.background_process_metrics import (
|
||||||
BackgroundProcessLoggingContext,
|
BackgroundProcessLoggingContext,
|
||||||
run_as_background_process,
|
run_as_background_process,
|
||||||
|
wrap_as_background_process,
|
||||||
)
|
)
|
||||||
from synapse.replication.tcp.commands import (
|
from synapse.replication.tcp.commands import (
|
||||||
Command,
|
Command,
|
||||||
|
@ -59,16 +60,16 @@ class RedisSubscriber(txredisapi.SubscriberProtocol, AbstractConnection):
|
||||||
immediately after initialisation.
|
immediately after initialisation.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
handler: The command handler to handle incoming commands.
|
synapse_handler: The command handler to handle incoming commands.
|
||||||
stream_name: The *redis* stream name to subscribe to and publish from
|
synapse_stream_name: The *redis* stream name to subscribe to and publish
|
||||||
(not anything to do with Synapse replication streams).
|
from (not anything to do with Synapse replication streams).
|
||||||
outbound_redis_connection: The connection to redis to use to send
|
synapse_outbound_redis_connection: The connection to redis to use to send
|
||||||
commands.
|
commands.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
handler = None # type: ReplicationCommandHandler
|
synapse_handler = None # type: ReplicationCommandHandler
|
||||||
stream_name = None # type: str
|
synapse_stream_name = None # type: str
|
||||||
outbound_redis_connection = None # type: txredisapi.RedisProtocol
|
synapse_outbound_redis_connection = None # type: txredisapi.RedisProtocol
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
@ -88,19 +89,19 @@ class RedisSubscriber(txredisapi.SubscriberProtocol, AbstractConnection):
|
||||||
# it's important to make sure that we only send the REPLICATE command once we
|
# it's important to make sure that we only send the REPLICATE command once we
|
||||||
# have successfully subscribed to the stream - otherwise we might miss the
|
# have successfully subscribed to the stream - otherwise we might miss the
|
||||||
# POSITION response sent back by the other end.
|
# POSITION response sent back by the other end.
|
||||||
logger.info("Sending redis SUBSCRIBE for %s", self.stream_name)
|
logger.info("Sending redis SUBSCRIBE for %s", self.synapse_stream_name)
|
||||||
await make_deferred_yieldable(self.subscribe(self.stream_name))
|
await make_deferred_yieldable(self.subscribe(self.synapse_stream_name))
|
||||||
logger.info(
|
logger.info(
|
||||||
"Successfully subscribed to redis stream, sending REPLICATE command"
|
"Successfully subscribed to redis stream, sending REPLICATE command"
|
||||||
)
|
)
|
||||||
self.handler.new_connection(self)
|
self.synapse_handler.new_connection(self)
|
||||||
await self._async_send_command(ReplicateCommand())
|
await self._async_send_command(ReplicateCommand())
|
||||||
logger.info("REPLICATE successfully sent")
|
logger.info("REPLICATE successfully sent")
|
||||||
|
|
||||||
# We send out our positions when there is a new connection in case the
|
# We send out our positions when there is a new connection in case the
|
||||||
# other side missed updates. We do this for Redis connections as the
|
# other side missed updates. We do this for Redis connections as the
|
||||||
# other side won't know we've connected and so won't issue a REPLICATE.
|
# other side won't know we've connected and so won't issue a REPLICATE.
|
||||||
self.handler.send_positions_to_connection(self)
|
self.synapse_handler.send_positions_to_connection(self)
|
||||||
|
|
||||||
def messageReceived(self, pattern: str, channel: str, message: str):
|
def messageReceived(self, pattern: str, channel: str, message: str):
|
||||||
"""Received a message from redis.
|
"""Received a message from redis.
|
||||||
|
@ -137,7 +138,7 @@ class RedisSubscriber(txredisapi.SubscriberProtocol, AbstractConnection):
|
||||||
cmd: received command
|
cmd: received command
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cmd_func = getattr(self.handler, "on_%s" % (cmd.NAME,), None)
|
cmd_func = getattr(self.synapse_handler, "on_%s" % (cmd.NAME,), None)
|
||||||
if not cmd_func:
|
if not cmd_func:
|
||||||
logger.warning("Unhandled command: %r", cmd)
|
logger.warning("Unhandled command: %r", cmd)
|
||||||
return
|
return
|
||||||
|
@ -155,7 +156,7 @@ class RedisSubscriber(txredisapi.SubscriberProtocol, AbstractConnection):
|
||||||
def connectionLost(self, reason):
|
def connectionLost(self, reason):
|
||||||
logger.info("Lost connection to redis")
|
logger.info("Lost connection to redis")
|
||||||
super().connectionLost(reason)
|
super().connectionLost(reason)
|
||||||
self.handler.lost_connection(self)
|
self.synapse_handler.lost_connection(self)
|
||||||
|
|
||||||
# mark the logging context as finished
|
# mark the logging context as finished
|
||||||
self._logging_context.__exit__(None, None, None)
|
self._logging_context.__exit__(None, None, None)
|
||||||
|
@ -183,11 +184,54 @@ class RedisSubscriber(txredisapi.SubscriberProtocol, AbstractConnection):
|
||||||
tcp_outbound_commands_counter.labels(cmd.NAME, "redis").inc()
|
tcp_outbound_commands_counter.labels(cmd.NAME, "redis").inc()
|
||||||
|
|
||||||
await make_deferred_yieldable(
|
await make_deferred_yieldable(
|
||||||
self.outbound_redis_connection.publish(self.stream_name, encoded_string)
|
self.synapse_outbound_redis_connection.publish(
|
||||||
|
self.synapse_stream_name, encoded_string
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class RedisDirectTcpReplicationClientFactory(txredisapi.SubscriberFactory):
|
class SynapseRedisFactory(txredisapi.RedisFactory):
|
||||||
|
"""A subclass of RedisFactory that periodically sends pings to ensure that
|
||||||
|
we detect dead connections.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hs: "HomeServer",
|
||||||
|
uuid: str,
|
||||||
|
dbid: Optional[int],
|
||||||
|
poolsize: int,
|
||||||
|
isLazy: bool = False,
|
||||||
|
handler: Type = txredisapi.ConnectionHandler,
|
||||||
|
charset: str = "utf-8",
|
||||||
|
password: Optional[str] = None,
|
||||||
|
replyTimeout: int = 30,
|
||||||
|
convertNumbers: Optional[int] = True,
|
||||||
|
):
|
||||||
|
super().__init__(
|
||||||
|
uuid=uuid,
|
||||||
|
dbid=dbid,
|
||||||
|
poolsize=poolsize,
|
||||||
|
isLazy=isLazy,
|
||||||
|
handler=handler,
|
||||||
|
charset=charset,
|
||||||
|
password=password,
|
||||||
|
replyTimeout=replyTimeout,
|
||||||
|
convertNumbers=convertNumbers,
|
||||||
|
)
|
||||||
|
|
||||||
|
hs.get_clock().looping_call(self._send_ping, 30 * 1000)
|
||||||
|
|
||||||
|
@wrap_as_background_process("redis_ping")
|
||||||
|
async def _send_ping(self):
|
||||||
|
for connection in self.pool:
|
||||||
|
try:
|
||||||
|
await make_deferred_yieldable(connection.ping())
|
||||||
|
except Exception:
|
||||||
|
logger.warning("Failed to send ping to a redis connection")
|
||||||
|
|
||||||
|
|
||||||
|
class RedisDirectTcpReplicationClientFactory(SynapseRedisFactory):
|
||||||
"""This is a reconnecting factory that connects to redis and immediately
|
"""This is a reconnecting factory that connects to redis and immediately
|
||||||
subscribes to a stream.
|
subscribes to a stream.
|
||||||
|
|
||||||
|
@ -206,65 +250,62 @@ class RedisDirectTcpReplicationClientFactory(txredisapi.SubscriberFactory):
|
||||||
self, hs: "HomeServer", outbound_redis_connection: txredisapi.RedisProtocol
|
self, hs: "HomeServer", outbound_redis_connection: txredisapi.RedisProtocol
|
||||||
):
|
):
|
||||||
|
|
||||||
super().__init__()
|
super().__init__(
|
||||||
|
hs,
|
||||||
|
uuid="subscriber",
|
||||||
|
dbid=None,
|
||||||
|
poolsize=1,
|
||||||
|
replyTimeout=30,
|
||||||
|
password=hs.config.redis.redis_password,
|
||||||
|
)
|
||||||
|
|
||||||
# This sets the password on the RedisFactory base class (as
|
self.synapse_handler = hs.get_tcp_replication()
|
||||||
# SubscriberFactory constructor doesn't pass it through).
|
self.synapse_stream_name = hs.hostname
|
||||||
self.password = hs.config.redis.redis_password
|
|
||||||
|
|
||||||
self.handler = hs.get_tcp_replication()
|
self.synapse_outbound_redis_connection = outbound_redis_connection
|
||||||
self.stream_name = hs.hostname
|
|
||||||
|
|
||||||
self.outbound_redis_connection = outbound_redis_connection
|
|
||||||
|
|
||||||
def buildProtocol(self, addr):
|
def buildProtocol(self, addr):
|
||||||
p = super().buildProtocol(addr) # type: RedisSubscriber
|
p = super().buildProtocol(addr)
|
||||||
|
p = cast(RedisSubscriber, p)
|
||||||
|
|
||||||
# We do this here rather than add to the constructor of `RedisSubscriber`
|
# We do this here rather than add to the constructor of `RedisSubscriber`
|
||||||
# as to do so would involve overriding `buildProtocol` entirely, however
|
# as to do so would involve overriding `buildProtocol` entirely, however
|
||||||
# the base method does some other things than just instantiating the
|
# the base method does some other things than just instantiating the
|
||||||
# protocol.
|
# protocol.
|
||||||
p.handler = self.handler
|
p.synapse_handler = self.synapse_handler
|
||||||
p.outbound_redis_connection = self.outbound_redis_connection
|
p.synapse_outbound_redis_connection = self.synapse_outbound_redis_connection
|
||||||
p.stream_name = self.stream_name
|
p.synapse_stream_name = self.synapse_stream_name
|
||||||
p.password = self.password
|
|
||||||
|
|
||||||
return p
|
return p
|
||||||
|
|
||||||
|
|
||||||
def lazyConnection(
|
def lazyConnection(
|
||||||
reactor,
|
hs: "HomeServer",
|
||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
port: int = 6379,
|
port: int = 6379,
|
||||||
dbid: Optional[int] = None,
|
dbid: Optional[int] = None,
|
||||||
reconnect: bool = True,
|
reconnect: bool = True,
|
||||||
charset: str = "utf-8",
|
|
||||||
password: Optional[str] = None,
|
password: Optional[str] = None,
|
||||||
connectTimeout: Optional[int] = None,
|
replyTimeout: int = 30,
|
||||||
replyTimeout: Optional[int] = None,
|
|
||||||
convertNumbers: bool = True,
|
|
||||||
) -> txredisapi.RedisProtocol:
|
) -> txredisapi.RedisProtocol:
|
||||||
"""Equivalent to `txredisapi.lazyConnection`, except allows specifying a
|
"""Creates a connection to Redis that is lazily set up and reconnects if the
|
||||||
reactor.
|
connections is lost.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
isLazy = True
|
|
||||||
poolsize = 1
|
|
||||||
|
|
||||||
uuid = "%s:%d" % (host, port)
|
uuid = "%s:%d" % (host, port)
|
||||||
factory = txredisapi.RedisFactory(
|
factory = SynapseRedisFactory(
|
||||||
uuid,
|
hs,
|
||||||
dbid,
|
uuid=uuid,
|
||||||
poolsize,
|
dbid=dbid,
|
||||||
isLazy,
|
poolsize=1,
|
||||||
txredisapi.ConnectionHandler,
|
isLazy=True,
|
||||||
charset,
|
handler=txredisapi.ConnectionHandler,
|
||||||
password,
|
password=password,
|
||||||
replyTimeout,
|
replyTimeout=replyTimeout,
|
||||||
convertNumbers,
|
|
||||||
)
|
)
|
||||||
factory.continueTrying = reconnect
|
factory.continueTrying = reconnect
|
||||||
for x in range(poolsize):
|
|
||||||
reactor.connectTCP(host, port, factory, connectTimeout)
|
reactor = hs.get_reactor()
|
||||||
|
reactor.connectTCP(host, port, factory, 30)
|
||||||
|
|
||||||
return factory.handler
|
return factory.handler
|
||||||
|
|
Loading…
Reference in New Issue