Simplify GC prometheus metrics (#11723)
Rather than hooking into the reactor loop, just add a timed task that runs every 100 ms to do the garbage collection. Part 1 of a quest to simplify the reactor monkey-patching.
This commit is contained in:
parent
10a88ba91c
commit
20c6d85c6e
|
@ -0,0 +1 @@
|
|||
Simplify calculation of prometheus metrics for garbage collection.
|
|
@ -60,7 +60,7 @@ from synapse.events.spamcheck import load_legacy_spam_checkers
|
|||
from synapse.events.third_party_rules import load_legacy_third_party_event_rules
|
||||
from synapse.handlers.auth import load_legacy_password_auth_providers
|
||||
from synapse.logging.context import PreserveLoggingContext
|
||||
from synapse.metrics import register_threadpool
|
||||
from synapse.metrics import install_gc_manager, register_threadpool
|
||||
from synapse.metrics.background_process_metrics import wrap_as_background_process
|
||||
from synapse.metrics.jemalloc import setup_jemalloc_stats
|
||||
from synapse.types import ISynapseReactor
|
||||
|
@ -159,6 +159,7 @@ def start_reactor(
|
|||
change_resource_limit(soft_file_limit)
|
||||
if gc_thresholds:
|
||||
gc.set_threshold(*gc_thresholds)
|
||||
install_gc_manager()
|
||||
run_command()
|
||||
|
||||
# make sure that we run the reactor with the sentinel log context,
|
||||
|
|
|
@ -13,7 +13,6 @@
|
|||
# limitations under the License.
|
||||
|
||||
import functools
|
||||
import gc
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
|
@ -41,7 +40,6 @@ import attr
|
|||
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Metric
|
||||
from prometheus_client.core import (
|
||||
REGISTRY,
|
||||
CounterMetricFamily,
|
||||
GaugeHistogramMetricFamily,
|
||||
GaugeMetricFamily,
|
||||
)
|
||||
|
@ -56,13 +54,13 @@ from synapse.metrics._exposition import (
|
|||
generate_latest,
|
||||
start_http_server,
|
||||
)
|
||||
from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager
|
||||
from synapse.util.versionstring import get_version_string
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
METRICS_PREFIX = "/_synapse/metrics"
|
||||
|
||||
running_on_pypy = platform.python_implementation() == "PyPy"
|
||||
all_gauges: "Dict[str, Union[LaterGauge, InFlightGauge]]" = {}
|
||||
|
||||
HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
|
||||
|
@ -369,121 +367,6 @@ class CPUMetrics:
|
|||
|
||||
REGISTRY.register(CPUMetrics())
|
||||
|
||||
#
|
||||
# Python GC metrics
|
||||
#
|
||||
|
||||
gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"])
|
||||
gc_time = Histogram(
|
||||
"python_gc_time",
|
||||
"Time taken to GC (sec)",
|
||||
["gen"],
|
||||
buckets=[
|
||||
0.0025,
|
||||
0.005,
|
||||
0.01,
|
||||
0.025,
|
||||
0.05,
|
||||
0.10,
|
||||
0.25,
|
||||
0.50,
|
||||
1.00,
|
||||
2.50,
|
||||
5.00,
|
||||
7.50,
|
||||
15.00,
|
||||
30.00,
|
||||
45.00,
|
||||
60.00,
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class GCCounts:
|
||||
def collect(self) -> Iterable[Metric]:
|
||||
cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"])
|
||||
for n, m in enumerate(gc.get_count()):
|
||||
cm.add_metric([str(n)], m)
|
||||
|
||||
yield cm
|
||||
|
||||
|
||||
if not running_on_pypy:
|
||||
REGISTRY.register(GCCounts())
|
||||
|
||||
|
||||
#
|
||||
# PyPy GC / memory metrics
|
||||
#
|
||||
|
||||
|
||||
class PyPyGCStats:
|
||||
def collect(self) -> Iterable[Metric]:
|
||||
|
||||
# @stats is a pretty-printer object with __str__() returning a nice table,
|
||||
# plus some fields that contain data from that table.
|
||||
# unfortunately, fields are pretty-printed themselves (i. e. '4.5MB').
|
||||
stats = gc.get_stats(memory_pressure=False) # type: ignore
|
||||
# @s contains same fields as @stats, but as actual integers.
|
||||
s = stats._s # type: ignore
|
||||
|
||||
# also note that field naming is completely braindead
|
||||
# and only vaguely correlates with the pretty-printed table.
|
||||
# >>>> gc.get_stats(False)
|
||||
# Total memory consumed:
|
||||
# GC used: 8.7MB (peak: 39.0MB) # s.total_gc_memory, s.peak_memory
|
||||
# in arenas: 3.0MB # s.total_arena_memory
|
||||
# rawmalloced: 1.7MB # s.total_rawmalloced_memory
|
||||
# nursery: 4.0MB # s.nursery_size
|
||||
# raw assembler used: 31.0kB # s.jit_backend_used
|
||||
# -----------------------------
|
||||
# Total: 8.8MB # stats.memory_used_sum
|
||||
#
|
||||
# Total memory allocated:
|
||||
# GC allocated: 38.7MB (peak: 41.1MB) # s.total_allocated_memory, s.peak_allocated_memory
|
||||
# in arenas: 30.9MB # s.peak_arena_memory
|
||||
# rawmalloced: 4.1MB # s.peak_rawmalloced_memory
|
||||
# nursery: 4.0MB # s.nursery_size
|
||||
# raw assembler allocated: 1.0MB # s.jit_backend_allocated
|
||||
# -----------------------------
|
||||
# Total: 39.7MB # stats.memory_allocated_sum
|
||||
#
|
||||
# Total time spent in GC: 0.073 # s.total_gc_time
|
||||
|
||||
pypy_gc_time = CounterMetricFamily(
|
||||
"pypy_gc_time_seconds_total",
|
||||
"Total time spent in PyPy GC",
|
||||
labels=[],
|
||||
)
|
||||
pypy_gc_time.add_metric([], s.total_gc_time / 1000)
|
||||
yield pypy_gc_time
|
||||
|
||||
pypy_mem = GaugeMetricFamily(
|
||||
"pypy_memory_bytes",
|
||||
"Memory tracked by PyPy allocator",
|
||||
labels=["state", "class", "kind"],
|
||||
)
|
||||
# memory used by JIT assembler
|
||||
pypy_mem.add_metric(["used", "", "jit"], s.jit_backend_used)
|
||||
pypy_mem.add_metric(["allocated", "", "jit"], s.jit_backend_allocated)
|
||||
# memory used by GCed objects
|
||||
pypy_mem.add_metric(["used", "", "arenas"], s.total_arena_memory)
|
||||
pypy_mem.add_metric(["allocated", "", "arenas"], s.peak_arena_memory)
|
||||
pypy_mem.add_metric(["used", "", "rawmalloced"], s.total_rawmalloced_memory)
|
||||
pypy_mem.add_metric(["allocated", "", "rawmalloced"], s.peak_rawmalloced_memory)
|
||||
pypy_mem.add_metric(["used", "", "nursery"], s.nursery_size)
|
||||
pypy_mem.add_metric(["allocated", "", "nursery"], s.nursery_size)
|
||||
# totals
|
||||
pypy_mem.add_metric(["used", "totals", "gc"], s.total_gc_memory)
|
||||
pypy_mem.add_metric(["allocated", "totals", "gc"], s.total_allocated_memory)
|
||||
pypy_mem.add_metric(["used", "totals", "gc_peak"], s.peak_memory)
|
||||
pypy_mem.add_metric(["allocated", "totals", "gc_peak"], s.peak_allocated_memory)
|
||||
yield pypy_mem
|
||||
|
||||
|
||||
if running_on_pypy:
|
||||
REGISTRY.register(PyPyGCStats())
|
||||
|
||||
|
||||
#
|
||||
# Twisted reactor metrics
|
||||
|
@ -612,14 +495,6 @@ class ReactorLastSeenMetric:
|
|||
|
||||
REGISTRY.register(ReactorLastSeenMetric())
|
||||
|
||||
# The minimum time in seconds between GCs for each generation, regardless of the current GC
|
||||
# thresholds and counts.
|
||||
MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0)
|
||||
|
||||
# The time (in seconds since the epoch) of the last time we did a GC for each generation.
|
||||
_last_gc = [0.0, 0.0, 0.0]
|
||||
|
||||
|
||||
F = TypeVar("F", bound=Callable[..., Any])
|
||||
|
||||
|
||||
|
@ -658,34 +533,6 @@ def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F:
|
|||
global last_ticked
|
||||
last_ticked = end
|
||||
|
||||
if running_on_pypy:
|
||||
return ret
|
||||
|
||||
# Check if we need to do a manual GC (since its been disabled), and do
|
||||
# one if necessary. Note we go in reverse order as e.g. a gen 1 GC may
|
||||
# promote an object into gen 2, and we don't want to handle the same
|
||||
# object multiple times.
|
||||
threshold = gc.get_threshold()
|
||||
counts = gc.get_count()
|
||||
for i in (2, 1, 0):
|
||||
# We check if we need to do one based on a straightforward
|
||||
# comparison between the threshold and count. We also do an extra
|
||||
# check to make sure that we don't a GC too often.
|
||||
if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]:
|
||||
if i == 0:
|
||||
logger.debug("Collecting gc %d", i)
|
||||
else:
|
||||
logger.info("Collecting gc %d", i)
|
||||
|
||||
start = time.time()
|
||||
unreachable = gc.collect(i)
|
||||
end = time.time()
|
||||
|
||||
_last_gc[i] = end
|
||||
|
||||
gc_time.labels(i).observe(end - start)
|
||||
gc_unreachable.labels(i).set(unreachable)
|
||||
|
||||
return ret
|
||||
|
||||
return cast(F, f)
|
||||
|
@ -701,11 +548,6 @@ try:
|
|||
# runUntilCurrent is called when we have pending calls. It is called once
|
||||
# per iteratation after fd polling.
|
||||
reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore
|
||||
|
||||
# We manually run the GC each reactor tick so that we can get some metrics
|
||||
# about time spent doing GC,
|
||||
if not running_on_pypy:
|
||||
gc.disable()
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
|
@ -717,4 +559,6 @@ __all__ = [
|
|||
"LaterGauge",
|
||||
"InFlightGauge",
|
||||
"GaugeBucketCollector",
|
||||
"MIN_TIME_BETWEEN_GCS",
|
||||
"install_gc_manager",
|
||||
]
|
||||
|
|
|
@ -0,0 +1,203 @@
|
|||
# Copyright 2015-2022 The Matrix.org Foundation C.I.C.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import gc
|
||||
import logging
|
||||
import platform
|
||||
import time
|
||||
from typing import Iterable
|
||||
|
||||
from prometheus_client.core import (
|
||||
REGISTRY,
|
||||
CounterMetricFamily,
|
||||
Gauge,
|
||||
GaugeMetricFamily,
|
||||
Histogram,
|
||||
Metric,
|
||||
)
|
||||
|
||||
from twisted.internet import task
|
||||
|
||||
"""Prometheus metrics for garbage collection"""
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# The minimum time in seconds between GCs for each generation, regardless of the current GC
|
||||
# thresholds and counts.
|
||||
MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0)
|
||||
|
||||
running_on_pypy = platform.python_implementation() == "PyPy"
|
||||
|
||||
#
|
||||
# Python GC metrics
|
||||
#
|
||||
|
||||
gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"])
|
||||
gc_time = Histogram(
|
||||
"python_gc_time",
|
||||
"Time taken to GC (sec)",
|
||||
["gen"],
|
||||
buckets=[
|
||||
0.0025,
|
||||
0.005,
|
||||
0.01,
|
||||
0.025,
|
||||
0.05,
|
||||
0.10,
|
||||
0.25,
|
||||
0.50,
|
||||
1.00,
|
||||
2.50,
|
||||
5.00,
|
||||
7.50,
|
||||
15.00,
|
||||
30.00,
|
||||
45.00,
|
||||
60.00,
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class GCCounts:
|
||||
def collect(self) -> Iterable[Metric]:
|
||||
cm = GaugeMetricFamily("python_gc_counts", "GC object counts", labels=["gen"])
|
||||
for n, m in enumerate(gc.get_count()):
|
||||
cm.add_metric([str(n)], m)
|
||||
|
||||
yield cm
|
||||
|
||||
|
||||
def install_gc_manager() -> None:
|
||||
"""Disable automatic GC, and replace it with a task that runs every 100ms
|
||||
|
||||
This means that (a) we can limit how often GC runs; (b) we can get some metrics
|
||||
about GC activity.
|
||||
|
||||
It does nothing on PyPy.
|
||||
"""
|
||||
|
||||
if running_on_pypy:
|
||||
return
|
||||
|
||||
REGISTRY.register(GCCounts())
|
||||
|
||||
gc.disable()
|
||||
|
||||
# The time (in seconds since the epoch) of the last time we did a GC for each generation.
|
||||
_last_gc = [0.0, 0.0, 0.0]
|
||||
|
||||
def _maybe_gc() -> None:
|
||||
# Check if we need to do a manual GC (since its been disabled), and do
|
||||
# one if necessary. Note we go in reverse order as e.g. a gen 1 GC may
|
||||
# promote an object into gen 2, and we don't want to handle the same
|
||||
# object multiple times.
|
||||
threshold = gc.get_threshold()
|
||||
counts = gc.get_count()
|
||||
end = time.time()
|
||||
for i in (2, 1, 0):
|
||||
# We check if we need to do one based on a straightforward
|
||||
# comparison between the threshold and count. We also do an extra
|
||||
# check to make sure that we don't a GC too often.
|
||||
if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]:
|
||||
if i == 0:
|
||||
logger.debug("Collecting gc %d", i)
|
||||
else:
|
||||
logger.info("Collecting gc %d", i)
|
||||
|
||||
start = time.time()
|
||||
unreachable = gc.collect(i)
|
||||
end = time.time()
|
||||
|
||||
_last_gc[i] = end
|
||||
|
||||
gc_time.labels(i).observe(end - start)
|
||||
gc_unreachable.labels(i).set(unreachable)
|
||||
|
||||
gc_task = task.LoopingCall(_maybe_gc)
|
||||
gc_task.start(0.1)
|
||||
|
||||
|
||||
#
|
||||
# PyPy GC / memory metrics
|
||||
#
|
||||
|
||||
|
||||
class PyPyGCStats:
|
||||
def collect(self) -> Iterable[Metric]:
|
||||
|
||||
# @stats is a pretty-printer object with __str__() returning a nice table,
|
||||
# plus some fields that contain data from that table.
|
||||
# unfortunately, fields are pretty-printed themselves (i. e. '4.5MB').
|
||||
stats = gc.get_stats(memory_pressure=False) # type: ignore
|
||||
# @s contains same fields as @stats, but as actual integers.
|
||||
s = stats._s # type: ignore
|
||||
|
||||
# also note that field naming is completely braindead
|
||||
# and only vaguely correlates with the pretty-printed table.
|
||||
# >>>> gc.get_stats(False)
|
||||
# Total memory consumed:
|
||||
# GC used: 8.7MB (peak: 39.0MB) # s.total_gc_memory, s.peak_memory
|
||||
# in arenas: 3.0MB # s.total_arena_memory
|
||||
# rawmalloced: 1.7MB # s.total_rawmalloced_memory
|
||||
# nursery: 4.0MB # s.nursery_size
|
||||
# raw assembler used: 31.0kB # s.jit_backend_used
|
||||
# -----------------------------
|
||||
# Total: 8.8MB # stats.memory_used_sum
|
||||
#
|
||||
# Total memory allocated:
|
||||
# GC allocated: 38.7MB (peak: 41.1MB) # s.total_allocated_memory, s.peak_allocated_memory
|
||||
# in arenas: 30.9MB # s.peak_arena_memory
|
||||
# rawmalloced: 4.1MB # s.peak_rawmalloced_memory
|
||||
# nursery: 4.0MB # s.nursery_size
|
||||
# raw assembler allocated: 1.0MB # s.jit_backend_allocated
|
||||
# -----------------------------
|
||||
# Total: 39.7MB # stats.memory_allocated_sum
|
||||
#
|
||||
# Total time spent in GC: 0.073 # s.total_gc_time
|
||||
|
||||
pypy_gc_time = CounterMetricFamily(
|
||||
"pypy_gc_time_seconds_total",
|
||||
"Total time spent in PyPy GC",
|
||||
labels=[],
|
||||
)
|
||||
pypy_gc_time.add_metric([], s.total_gc_time / 1000)
|
||||
yield pypy_gc_time
|
||||
|
||||
pypy_mem = GaugeMetricFamily(
|
||||
"pypy_memory_bytes",
|
||||
"Memory tracked by PyPy allocator",
|
||||
labels=["state", "class", "kind"],
|
||||
)
|
||||
# memory used by JIT assembler
|
||||
pypy_mem.add_metric(["used", "", "jit"], s.jit_backend_used)
|
||||
pypy_mem.add_metric(["allocated", "", "jit"], s.jit_backend_allocated)
|
||||
# memory used by GCed objects
|
||||
pypy_mem.add_metric(["used", "", "arenas"], s.total_arena_memory)
|
||||
pypy_mem.add_metric(["allocated", "", "arenas"], s.peak_arena_memory)
|
||||
pypy_mem.add_metric(["used", "", "rawmalloced"], s.total_rawmalloced_memory)
|
||||
pypy_mem.add_metric(["allocated", "", "rawmalloced"], s.peak_rawmalloced_memory)
|
||||
pypy_mem.add_metric(["used", "", "nursery"], s.nursery_size)
|
||||
pypy_mem.add_metric(["allocated", "", "nursery"], s.nursery_size)
|
||||
# totals
|
||||
pypy_mem.add_metric(["used", "totals", "gc"], s.total_gc_memory)
|
||||
pypy_mem.add_metric(["allocated", "totals", "gc"], s.total_allocated_memory)
|
||||
pypy_mem.add_metric(["used", "totals", "gc_peak"], s.peak_memory)
|
||||
pypy_mem.add_metric(["allocated", "totals", "gc_peak"], s.peak_allocated_memory)
|
||||
yield pypy_mem
|
||||
|
||||
|
||||
if running_on_pypy:
|
||||
REGISTRY.register(PyPyGCStats())
|
Loading…
Reference in New Issue