Fix rate limit metrics registering twice and misreporting (#13649)
* Fix rate limit metrics registering twice and misreporting Fix https://github.com/matrix-org/synapse/issues/13641 * Fix lints * Add changelog * Document `metrics_name=None`.
This commit is contained in:
parent
ea85a2bf6c
commit
1eea73b413
|
@ -0,0 +1 @@
|
|||
Fix rate limit gauge metrics registering twice and misreporting (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`).
|
|
@ -756,7 +756,9 @@ class HomeServer(metaclass=abc.ABCMeta):
|
|||
@cache_in_self
|
||||
def get_federation_ratelimiter(self) -> FederationRateLimiter:
|
||||
return FederationRateLimiter(
|
||||
self.get_clock(), config=self.config.ratelimiting.rc_federation
|
||||
self.get_clock(),
|
||||
config=self.config.ratelimiting.rc_federation,
|
||||
metrics_name="federation_servlets",
|
||||
)
|
||||
|
||||
@cache_in_self
|
||||
|
|
|
@ -15,10 +15,23 @@
|
|||
import collections
|
||||
import contextlib
|
||||
import logging
|
||||
import threading
|
||||
import typing
|
||||
from typing import Any, DefaultDict, Iterator, List, Set
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
DefaultDict,
|
||||
Dict,
|
||||
Iterator,
|
||||
List,
|
||||
Mapping,
|
||||
Optional,
|
||||
Set,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from prometheus_client.core import Counter
|
||||
from typing_extensions import ContextManager
|
||||
|
||||
from twisted.internet import defer
|
||||
|
||||
|
@ -40,12 +53,20 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
# Track how much the ratelimiter is affecting requests
|
||||
rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "")
|
||||
rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "")
|
||||
rate_limit_sleep_counter = Counter(
|
||||
"synapse_rate_limit_sleep",
|
||||
"Number of requests slept by the rate limiter",
|
||||
["rate_limiter_name"],
|
||||
)
|
||||
rate_limit_reject_counter = Counter(
|
||||
"synapse_rate_limit_reject",
|
||||
"Number of requests rejected by the rate limiter",
|
||||
["rate_limiter_name"],
|
||||
)
|
||||
queue_wait_timer = Histogram(
|
||||
"synapse_rate_limit_queue_wait_time_seconds",
|
||||
"sec",
|
||||
[],
|
||||
"Amount of time spent waiting for the rate limiter to let our request through.",
|
||||
["rate_limiter_name"],
|
||||
buckets=(
|
||||
0.005,
|
||||
0.01,
|
||||
|
@ -65,35 +86,92 @@ queue_wait_timer = Histogram(
|
|||
)
|
||||
|
||||
|
||||
_rate_limiter_instances: Set["FederationRateLimiter"] = set()
|
||||
# Protects the _rate_limiter_instances set from concurrent access
|
||||
_rate_limiter_instances_lock = threading.Lock()
|
||||
|
||||
|
||||
def _get_counts_from_rate_limiter_instance(
|
||||
count_func: Callable[["FederationRateLimiter"], int]
|
||||
) -> Mapping[Tuple[str, ...], int]:
|
||||
"""Returns a count of something (slept/rejected hosts) by (metrics_name)"""
|
||||
# Cast to a list to prevent it changing while the Prometheus
|
||||
# thread is collecting metrics
|
||||
with _rate_limiter_instances_lock:
|
||||
rate_limiter_instances = list(_rate_limiter_instances)
|
||||
|
||||
# Map from (metrics_name,) -> int, the number of something like slept hosts
|
||||
# or rejected hosts. The key type is Tuple[str], but we leave the length
|
||||
# unspecified for compatability with LaterGauge's annotations.
|
||||
counts: Dict[Tuple[str, ...], int] = {}
|
||||
for rate_limiter_instance in rate_limiter_instances:
|
||||
# Only track metrics if they provided a `metrics_name` to
|
||||
# differentiate this instance of the rate limiter.
|
||||
if rate_limiter_instance.metrics_name:
|
||||
key = (rate_limiter_instance.metrics_name,)
|
||||
counts[key] = count_func(rate_limiter_instance)
|
||||
|
||||
return counts
|
||||
|
||||
|
||||
# We track the number of affected hosts per time-period so we can
|
||||
# differentiate one really noisy homeserver from a general
|
||||
# ratelimit tuning problem across the federation.
|
||||
LaterGauge(
|
||||
"synapse_rate_limit_sleep_affected_hosts",
|
||||
"Number of hosts that had requests put to sleep",
|
||||
["rate_limiter_name"],
|
||||
lambda: _get_counts_from_rate_limiter_instance(
|
||||
lambda rate_limiter_instance: sum(
|
||||
ratelimiter.should_sleep()
|
||||
for ratelimiter in rate_limiter_instance.ratelimiters.values()
|
||||
)
|
||||
),
|
||||
)
|
||||
LaterGauge(
|
||||
"synapse_rate_limit_reject_affected_hosts",
|
||||
"Number of hosts that had requests rejected",
|
||||
["rate_limiter_name"],
|
||||
lambda: _get_counts_from_rate_limiter_instance(
|
||||
lambda rate_limiter_instance: sum(
|
||||
ratelimiter.should_reject()
|
||||
for ratelimiter in rate_limiter_instance.ratelimiters.values()
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class FederationRateLimiter:
|
||||
def __init__(self, clock: Clock, config: FederationRatelimitSettings):
|
||||
"""Used to rate limit request per-host."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
clock: Clock,
|
||||
config: FederationRatelimitSettings,
|
||||
metrics_name: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
clock
|
||||
config
|
||||
metrics_name: The name of the rate limiter so we can differentiate it
|
||||
from the rest in the metrics. If `None`, we don't track metrics
|
||||
for this rate limiter.
|
||||
|
||||
"""
|
||||
self.metrics_name = metrics_name
|
||||
|
||||
def new_limiter() -> "_PerHostRatelimiter":
|
||||
return _PerHostRatelimiter(clock=clock, config=config)
|
||||
return _PerHostRatelimiter(
|
||||
clock=clock, config=config, metrics_name=metrics_name
|
||||
)
|
||||
|
||||
self.ratelimiters: DefaultDict[
|
||||
str, "_PerHostRatelimiter"
|
||||
] = collections.defaultdict(new_limiter)
|
||||
|
||||
# We track the number of affected hosts per time-period so we can
|
||||
# differentiate one really noisy homeserver from a general
|
||||
# ratelimit tuning problem across the federation.
|
||||
LaterGauge(
|
||||
"synapse_rate_limit_sleep_affected_hosts",
|
||||
"Number of hosts that had requests put to sleep",
|
||||
[],
|
||||
lambda: sum(
|
||||
ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values()
|
||||
),
|
||||
)
|
||||
LaterGauge(
|
||||
"synapse_rate_limit_reject_affected_hosts",
|
||||
"Number of hosts that had requests rejected",
|
||||
[],
|
||||
lambda: sum(
|
||||
ratelimiter.should_reject()
|
||||
for ratelimiter in self.ratelimiters.values()
|
||||
),
|
||||
)
|
||||
with _rate_limiter_instances_lock:
|
||||
_rate_limiter_instances.add(self)
|
||||
|
||||
def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
|
||||
"""Used to ratelimit an incoming request from a given host
|
||||
|
@ -114,13 +192,23 @@ class FederationRateLimiter:
|
|||
|
||||
|
||||
class _PerHostRatelimiter:
|
||||
def __init__(self, clock: Clock, config: FederationRatelimitSettings):
|
||||
def __init__(
|
||||
self,
|
||||
clock: Clock,
|
||||
config: FederationRatelimitSettings,
|
||||
metrics_name: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
clock
|
||||
config
|
||||
metrics_name: The name of the rate limiter so we can differentiate it
|
||||
from the rest in the metrics. If `None`, we don't track metrics
|
||||
for this rate limiter.
|
||||
from the rest in the metrics
|
||||
"""
|
||||
self.clock = clock
|
||||
self.metrics_name = metrics_name
|
||||
|
||||
self.window_size = config.window_size
|
||||
self.sleep_limit = config.sleep_limit
|
||||
|
@ -178,7 +266,10 @@ class _PerHostRatelimiter:
|
|||
return len(self.request_times) > self.sleep_limit
|
||||
|
||||
async def _on_enter_with_tracing(self, request_id: object) -> None:
|
||||
with start_active_span("ratelimit wait"), queue_wait_timer.time():
|
||||
maybe_metrics_cm: ContextManager = contextlib.nullcontext()
|
||||
if self.metrics_name:
|
||||
maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time()
|
||||
with start_active_span("ratelimit wait"), maybe_metrics_cm:
|
||||
await self._on_enter(request_id)
|
||||
|
||||
def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
|
||||
|
@ -193,7 +284,8 @@ class _PerHostRatelimiter:
|
|||
# sleeping or in the ready queue).
|
||||
if self.should_reject():
|
||||
logger.debug("Ratelimiter(%s): rejecting request", self.host)
|
||||
rate_limit_reject_counter.inc()
|
||||
if self.metrics_name:
|
||||
rate_limit_reject_counter.labels(self.metrics_name).inc()
|
||||
raise LimitExceededError(
|
||||
retry_after_ms=int(self.window_size / self.sleep_limit)
|
||||
)
|
||||
|
@ -228,7 +320,8 @@ class _PerHostRatelimiter:
|
|||
id(request_id),
|
||||
self.sleep_sec,
|
||||
)
|
||||
rate_limit_sleep_counter.inc()
|
||||
if self.metrics_name:
|
||||
rate_limit_sleep_counter.labels(self.metrics_name).inc()
|
||||
ret_defer = run_in_background(self.clock.sleep, self.sleep_sec)
|
||||
|
||||
self.sleeping_requests.add(request_id)
|
||||
|
|
Loading…
Reference in New Issue