2018-07-18 03:50:33 -06:00
|
|
|
# Copyright 2018 New Vector Ltd
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
2018-09-20 10:05:34 -06:00
|
|
|
import logging
|
2018-08-20 10:16:58 -06:00
|
|
|
import threading
|
2022-05-27 04:17:33 -06:00
|
|
|
from contextlib import nullcontext
|
2019-09-24 07:16:16 -06:00
|
|
|
from functools import wraps
|
2021-11-17 12:07:02 -07:00
|
|
|
from types import TracebackType
|
|
|
|
from typing import (
|
|
|
|
TYPE_CHECKING,
|
|
|
|
Any,
|
|
|
|
Awaitable,
|
|
|
|
Callable,
|
|
|
|
Dict,
|
|
|
|
Iterable,
|
|
|
|
Optional,
|
|
|
|
Set,
|
|
|
|
Type,
|
|
|
|
TypeVar,
|
|
|
|
Union,
|
|
|
|
)
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
from prometheus_client import Metric
|
2020-05-29 06:25:32 -06:00
|
|
|
from prometheus_client.core import REGISTRY, Counter, Gauge
|
2022-05-06 06:35:20 -06:00
|
|
|
from typing_extensions import ParamSpec
|
2018-07-18 03:50:33 -06:00
|
|
|
|
|
|
|
from twisted.internet import defer
|
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
from synapse.logging.context import (
|
|
|
|
ContextResourceUsage,
|
|
|
|
LoggingContext,
|
|
|
|
PreserveLoggingContext,
|
|
|
|
)
|
2022-05-27 04:17:33 -06:00
|
|
|
from synapse.logging.opentracing import SynapseTags, start_active_span
|
2022-04-06 06:59:04 -06:00
|
|
|
from synapse.metrics._types import Collector
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2020-05-29 06:25:32 -06:00
|
|
|
if TYPE_CHECKING:
|
|
|
|
import resource
|
|
|
|
|
|
|
|
|
2018-09-20 10:05:34 -06:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2018-07-18 03:50:33 -06:00
|
|
|
_background_process_start_count = Counter(
|
|
|
|
"synapse_background_process_start_count",
|
|
|
|
"Number of background processes started",
|
|
|
|
["name"],
|
|
|
|
)
|
|
|
|
|
2020-05-29 06:25:32 -06:00
|
|
|
_background_process_in_flight_count = Gauge(
|
|
|
|
"synapse_background_process_in_flight_count",
|
|
|
|
"Number of background processes in flight",
|
|
|
|
labelnames=["name"],
|
|
|
|
)
|
|
|
|
|
2018-07-18 03:50:33 -06:00
|
|
|
# we set registry=None in all of these to stop them getting registered with
|
|
|
|
# the default registry. Instead we collect them all via the CustomCollector,
|
|
|
|
# which ensures that we can update them before they are collected.
|
|
|
|
#
|
|
|
|
_background_process_ru_utime = Counter(
|
|
|
|
"synapse_background_process_ru_utime_seconds",
|
|
|
|
"User CPU time used by background processes, in seconds",
|
|
|
|
["name"],
|
|
|
|
registry=None,
|
|
|
|
)
|
|
|
|
|
|
|
|
_background_process_ru_stime = Counter(
|
|
|
|
"synapse_background_process_ru_stime_seconds",
|
|
|
|
"System CPU time used by background processes, in seconds",
|
|
|
|
["name"],
|
|
|
|
registry=None,
|
|
|
|
)
|
|
|
|
|
|
|
|
_background_process_db_txn_count = Counter(
|
|
|
|
"synapse_background_process_db_txn_count",
|
|
|
|
"Number of database transactions done by background processes",
|
|
|
|
["name"],
|
|
|
|
registry=None,
|
|
|
|
)
|
|
|
|
|
|
|
|
_background_process_db_txn_duration = Counter(
|
|
|
|
"synapse_background_process_db_txn_duration_seconds",
|
|
|
|
(
|
|
|
|
"Seconds spent by background processes waiting for database "
|
|
|
|
"transactions, excluding scheduling time"
|
|
|
|
),
|
|
|
|
["name"],
|
|
|
|
registry=None,
|
|
|
|
)
|
|
|
|
|
|
|
|
_background_process_db_sched_duration = Counter(
|
|
|
|
"synapse_background_process_db_sched_duration_seconds",
|
|
|
|
"Seconds spent by background processes waiting for database connections",
|
|
|
|
["name"],
|
|
|
|
registry=None,
|
|
|
|
)
|
|
|
|
|
|
|
|
# map from description to a counter, so that we can name our logcontexts
|
|
|
|
# incrementally. (It actually duplicates _background_process_start_count, but
|
|
|
|
# it's much simpler to do so than to try to combine them.)
|
2021-07-15 04:02:43 -06:00
|
|
|
_background_process_counts: Dict[str, int] = {}
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2020-05-29 06:25:32 -06:00
|
|
|
# Set of all running background processes that became active active since the
|
|
|
|
# last time metrics were scraped (i.e. background processes that performed some
|
|
|
|
# work since the last scrape.)
|
2018-07-18 03:50:33 -06:00
|
|
|
#
|
2020-05-29 06:25:32 -06:00
|
|
|
# We do it like this to handle the case where we have a large number of
|
|
|
|
# background processes stacking up behind a lock or linearizer, where we then
|
|
|
|
# only need to iterate over and update metrics for the process that have
|
|
|
|
# actually been active and can ignore the idle ones.
|
2021-07-15 04:02:43 -06:00
|
|
|
_background_processes_active_since_last_scrape: "Set[_BackgroundProcess]" = set()
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2020-05-29 06:25:32 -06:00
|
|
|
# A lock that covers the above set and dict
|
2018-08-20 10:16:58 -06:00
|
|
|
_bg_metrics_lock = threading.Lock()
|
|
|
|
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2022-04-06 06:59:04 -06:00
|
|
|
class _Collector(Collector):
|
2018-07-18 03:50:33 -06:00
|
|
|
"""A custom metrics collector for the background process metrics.
|
|
|
|
|
|
|
|
Ensures that all of the metrics are up-to-date with any in-flight processes
|
|
|
|
before they are returned.
|
|
|
|
"""
|
2019-06-20 03:32:02 -06:00
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
def collect(self) -> Iterable[Metric]:
|
2020-05-29 06:25:32 -06:00
|
|
|
global _background_processes_active_since_last_scrape
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2020-05-29 06:25:32 -06:00
|
|
|
# We swap out the _background_processes set with an empty one so that
|
|
|
|
# we can safely iterate over the set without holding the lock.
|
2018-08-20 10:16:58 -06:00
|
|
|
with _bg_metrics_lock:
|
2020-05-29 06:25:32 -06:00
|
|
|
_background_processes_copy = _background_processes_active_since_last_scrape
|
|
|
|
_background_processes_active_since_last_scrape = set()
|
2018-08-20 10:16:58 -06:00
|
|
|
|
2020-05-29 06:25:32 -06:00
|
|
|
for process in _background_processes_copy:
|
|
|
|
process.update_metrics()
|
2018-07-18 03:50:33 -06:00
|
|
|
|
|
|
|
# now we need to run collect() over each of the static Counters, and
|
|
|
|
# yield each metric they return.
|
|
|
|
for m in (
|
|
|
|
_background_process_ru_utime,
|
|
|
|
_background_process_ru_stime,
|
|
|
|
_background_process_db_txn_count,
|
|
|
|
_background_process_db_txn_duration,
|
|
|
|
_background_process_db_sched_duration,
|
|
|
|
):
|
2021-07-19 08:28:05 -06:00
|
|
|
yield from m.collect()
|
2018-07-18 03:50:33 -06:00
|
|
|
|
|
|
|
|
|
|
|
REGISTRY.register(_Collector())
|
|
|
|
|
|
|
|
|
2020-09-04 04:54:56 -06:00
|
|
|
class _BackgroundProcess:
|
2021-11-17 12:07:02 -07:00
|
|
|
def __init__(self, desc: str, ctx: LoggingContext):
|
2018-07-18 03:50:33 -06:00
|
|
|
self.desc = desc
|
|
|
|
self._context = ctx
|
2021-11-17 12:07:02 -07:00
|
|
|
self._reported_stats: Optional[ContextResourceUsage] = None
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
def update_metrics(self) -> None:
|
2018-07-18 03:50:33 -06:00
|
|
|
"""Updates the metrics with values from this process."""
|
|
|
|
new_stats = self._context.get_resource_usage()
|
|
|
|
if self._reported_stats is None:
|
|
|
|
diff = new_stats
|
|
|
|
else:
|
|
|
|
diff = new_stats - self._reported_stats
|
|
|
|
self._reported_stats = new_stats
|
|
|
|
|
2022-10-31 07:02:07 -06:00
|
|
|
# For unknown reasons, the difference in times can be negative. See comment in
|
|
|
|
# synapse.http.request_metrics.RequestMetrics.update_metrics.
|
|
|
|
_background_process_ru_utime.labels(self.desc).inc(max(diff.ru_utime, 0))
|
|
|
|
_background_process_ru_stime.labels(self.desc).inc(max(diff.ru_stime, 0))
|
2018-07-18 03:50:33 -06:00
|
|
|
_background_process_db_txn_count.labels(self.desc).inc(diff.db_txn_count)
|
|
|
|
_background_process_db_txn_duration.labels(self.desc).inc(
|
|
|
|
diff.db_txn_duration_sec
|
|
|
|
)
|
|
|
|
_background_process_db_sched_duration.labels(self.desc).inc(
|
|
|
|
diff.db_sched_duration_sec
|
|
|
|
)
|
|
|
|
|
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
R = TypeVar("R")
|
|
|
|
|
|
|
|
|
|
|
|
def run_as_background_process(
|
|
|
|
desc: str,
|
|
|
|
func: Callable[..., Awaitable[Optional[R]]],
|
|
|
|
*args: Any,
|
|
|
|
bg_start_span: bool = True,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> "defer.Deferred[Optional[R]]":
|
2018-07-18 03:50:33 -06:00
|
|
|
"""Run the given function in its own logcontext, with resource metrics
|
|
|
|
|
|
|
|
This should be used to wrap processes which are fired off to run in the
|
|
|
|
background, instead of being associated with a particular request.
|
|
|
|
|
2018-07-26 04:44:26 -06:00
|
|
|
It returns a Deferred which completes when the function completes, but it doesn't
|
|
|
|
follow the synapse logcontext rules, which makes it appropriate for passing to
|
|
|
|
clock.looping_call and friends (or for firing-and-forgetting in the middle of a
|
2020-08-19 05:09:24 -06:00
|
|
|
normal synapse async function).
|
2018-07-26 04:44:26 -06:00
|
|
|
|
2018-07-18 03:50:33 -06:00
|
|
|
Args:
|
2020-08-06 06:20:42 -06:00
|
|
|
desc: a description for this background process type
|
2019-09-25 11:17:39 -06:00
|
|
|
func: a function, which may return a Deferred or a coroutine
|
2020-10-26 03:30:19 -06:00
|
|
|
bg_start_span: Whether to start an opentracing span. Defaults to True.
|
|
|
|
Should only be disabled for processes that will not log to or tag
|
|
|
|
a span.
|
2018-07-18 03:50:33 -06:00
|
|
|
args: positional args for func
|
|
|
|
kwargs: keyword args for func
|
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
Returns:
|
|
|
|
Deferred which returns the result of func, or `None` if func raises.
|
|
|
|
Note that the returned Deferred does not follow the synapse logcontext
|
|
|
|
rules.
|
2018-07-18 03:50:33 -06:00
|
|
|
"""
|
2019-06-20 03:32:02 -06:00
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
async def run() -> Optional[R]:
|
2018-08-20 10:16:58 -06:00
|
|
|
with _bg_metrics_lock:
|
|
|
|
count = _background_process_counts.get(desc, 0)
|
|
|
|
_background_process_counts[desc] = count + 1
|
|
|
|
|
2018-07-18 03:50:33 -06:00
|
|
|
_background_process_start_count.labels(desc).inc()
|
2020-05-29 06:25:32 -06:00
|
|
|
_background_process_in_flight_count.labels(desc).inc()
|
2018-07-18 03:50:33 -06:00
|
|
|
|
2021-04-21 03:03:31 -06:00
|
|
|
with BackgroundProcessLoggingContext(desc, count) as context:
|
2018-07-18 03:50:33 -06:00
|
|
|
try:
|
2020-10-26 03:30:19 -06:00
|
|
|
if bg_start_span:
|
2021-05-28 09:14:08 -06:00
|
|
|
ctx = start_active_span(
|
2021-06-07 10:57:49 -06:00
|
|
|
f"bgproc.{desc}", tags={SynapseTags.REQUEST_ID: str(context)}
|
2021-05-28 09:14:08 -06:00
|
|
|
)
|
2021-06-07 10:57:49 -06:00
|
|
|
else:
|
2022-07-21 06:01:52 -06:00
|
|
|
ctx = nullcontext() # type: ignore[assignment]
|
2020-10-26 03:30:19 -06:00
|
|
|
with ctx:
|
2021-11-17 12:07:02 -07:00
|
|
|
return await func(*args, **kwargs)
|
2018-09-20 09:15:21 -06:00
|
|
|
except Exception:
|
2020-08-06 06:20:42 -06:00
|
|
|
logger.exception(
|
|
|
|
"Background process '%s' threw an exception",
|
|
|
|
desc,
|
2020-07-09 06:01:33 -06:00
|
|
|
)
|
2021-11-17 12:07:02 -07:00
|
|
|
return None
|
2018-07-18 03:50:33 -06:00
|
|
|
finally:
|
2020-05-29 06:25:32 -06:00
|
|
|
_background_process_in_flight_count.labels(desc).dec()
|
2018-07-18 03:50:33 -06:00
|
|
|
|
|
|
|
with PreserveLoggingContext():
|
2020-08-06 06:20:42 -06:00
|
|
|
# Note that we return a Deferred here so that it can be used in a
|
|
|
|
# looping_call and other places that expect a Deferred.
|
|
|
|
return defer.ensureDeferred(run())
|
2019-09-24 07:16:16 -06:00
|
|
|
|
|
|
|
|
2022-05-06 06:35:20 -06:00
|
|
|
P = ParamSpec("P")
|
2021-11-17 12:07:02 -07:00
|
|
|
|
|
|
|
|
2022-05-06 06:35:20 -06:00
|
|
|
def wrap_as_background_process(
|
|
|
|
desc: str,
|
|
|
|
) -> Callable[
|
|
|
|
[Callable[P, Awaitable[Optional[R]]]],
|
|
|
|
Callable[P, "defer.Deferred[Optional[R]]"],
|
|
|
|
]:
|
|
|
|
"""Decorator that wraps an asynchronous function `func`, returning a synchronous
|
|
|
|
decorated function. Calling the decorated version runs `func` as a background
|
|
|
|
process, forwarding all arguments verbatim.
|
|
|
|
|
|
|
|
That is,
|
|
|
|
|
|
|
|
@wrap_as_background_process
|
|
|
|
def func(*args): ...
|
|
|
|
func(1, 2, third=3)
|
|
|
|
|
|
|
|
is equivalent to:
|
|
|
|
|
|
|
|
def func(*args): ...
|
|
|
|
run_as_background_process(func, 1, 2, third=3)
|
2019-09-24 07:16:16 -06:00
|
|
|
|
2022-05-06 06:35:20 -06:00
|
|
|
The former can be convenient if `func` needs to be run as a background process in
|
|
|
|
multiple places.
|
2019-09-24 07:16:16 -06:00
|
|
|
"""
|
|
|
|
|
2022-05-06 06:35:20 -06:00
|
|
|
def wrap_as_background_process_inner(
|
|
|
|
func: Callable[P, Awaitable[Optional[R]]]
|
|
|
|
) -> Callable[P, "defer.Deferred[Optional[R]]"]:
|
2019-09-24 07:16:16 -06:00
|
|
|
@wraps(func)
|
2021-11-17 12:07:02 -07:00
|
|
|
def wrap_as_background_process_inner_2(
|
2022-05-06 06:35:20 -06:00
|
|
|
*args: P.args, **kwargs: P.kwargs
|
2021-11-17 12:07:02 -07:00
|
|
|
) -> "defer.Deferred[Optional[R]]":
|
2022-05-06 06:35:20 -06:00
|
|
|
# type-ignore: mypy is confusing kwargs with the bg_start_span kwarg.
|
|
|
|
# Argument 4 to "run_as_background_process" has incompatible type
|
|
|
|
# "**P.kwargs"; expected "bool"
|
|
|
|
# See https://github.com/python/mypy/issues/8862
|
|
|
|
return run_as_background_process(desc, func, *args, **kwargs) # type: ignore[arg-type]
|
2019-09-24 07:16:16 -06:00
|
|
|
|
2022-05-06 06:35:20 -06:00
|
|
|
return wrap_as_background_process_inner_2
|
2019-09-24 07:16:16 -06:00
|
|
|
|
|
|
|
return wrap_as_background_process_inner
|
2020-05-29 06:25:32 -06:00
|
|
|
|
|
|
|
|
|
|
|
class BackgroundProcessLoggingContext(LoggingContext):
|
|
|
|
"""A logging context that tracks in flight metrics for background
|
|
|
|
processes.
|
|
|
|
"""
|
|
|
|
|
2021-04-20 07:19:00 -06:00
|
|
|
__slots__ = ["_proc"]
|
2020-05-29 06:25:32 -06:00
|
|
|
|
2021-04-21 03:03:31 -06:00
|
|
|
def __init__(self, name: str, instance_id: Optional[Union[int, str]] = None):
|
|
|
|
"""
|
|
|
|
|
|
|
|
Args:
|
|
|
|
name: The name of the background process. Each distinct `name` gets a
|
|
|
|
separate prometheus time series.
|
|
|
|
|
|
|
|
instance_id: an identifer to add to `name` to distinguish this instance of
|
|
|
|
the named background process in the logs. If this is `None`, one is
|
|
|
|
made up based on id(self).
|
|
|
|
"""
|
|
|
|
if instance_id is None:
|
|
|
|
instance_id = id(self)
|
|
|
|
super().__init__("%s-%s" % (name, instance_id))
|
2020-05-29 06:25:32 -06:00
|
|
|
self._proc = _BackgroundProcess(name, self)
|
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
def start(self, rusage: "Optional[resource.struct_rusage]") -> None:
|
2020-05-29 06:25:32 -06:00
|
|
|
"""Log context has started running (again)."""
|
|
|
|
|
|
|
|
super().start(rusage)
|
|
|
|
|
|
|
|
# We've become active again so we make sure we're in the list of active
|
|
|
|
# procs. (Note that "start" here means we've become active, as opposed
|
|
|
|
# to starting for the first time.)
|
|
|
|
with _bg_metrics_lock:
|
|
|
|
_background_processes_active_since_last_scrape.add(self._proc)
|
|
|
|
|
2021-11-17 12:07:02 -07:00
|
|
|
def __exit__(
|
|
|
|
self,
|
|
|
|
type: Optional[Type[BaseException]],
|
|
|
|
value: Optional[BaseException],
|
|
|
|
traceback: Optional[TracebackType],
|
|
|
|
) -> None:
|
2020-05-29 06:25:32 -06:00
|
|
|
"""Log context has finished."""
|
|
|
|
|
|
|
|
super().__exit__(type, value, traceback)
|
|
|
|
|
2020-10-23 10:38:40 -06:00
|
|
|
# The background process has finished. We explicitly remove and manually
|
2020-05-29 06:25:32 -06:00
|
|
|
# update the metrics here so that if nothing is scraping metrics the set
|
|
|
|
# doesn't infinitely grow.
|
|
|
|
with _bg_metrics_lock:
|
|
|
|
_background_processes_active_since_last_scrape.discard(self._proc)
|
|
|
|
|
|
|
|
self._proc.update_metrics()
|