Limit how often GC happens by time. (#9902)

Synapse can be quite memory intensive, and unless care is taken to tune
the GC thresholds it can end up thrashing, causing noticeable performance
problems for large servers. We fix this by limiting how often we GC a
given generation, regardless of current counts/thresholds.

This does not help with the reverse problem where the thresholds are set
too high, but that should only happen in situations where they've been
manually configured.

Adds a `gc_min_interval` config option to override the defaults.

Fixes #9890.
This commit is contained in:
Erik Johnston 2021-05-05 16:53:45 +01:00 committed by GitHub
parent de8f0a03a3
commit 1fb9a2d0bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 63 additions and 3 deletions

1
changelog.d/9902.feature Normal file
View File

@ -0,0 +1 @@
Add limits to how often Synapse will GC, ensuring that large servers do not end up GC thrashing if `gc_thresholds` has not been correctly set.

View File

@ -152,6 +152,16 @@ presence:
# #
#gc_thresholds: [700, 10, 10] #gc_thresholds: [700, 10, 10]
# The minimum time in seconds between each GC for a generation, regardless of
# the GC thresholds. This ensures that we don't do GC too frequently.
#
# A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive
# generation 0 GCs, etc.
#
# Defaults to `[1s, 10s, 30s]`.
#
#gc_min_interval: [0.5s, 30s, 1m]
# Set the limit on the returned events in the timeline in the get # Set the limit on the returned events in the timeline in the get
# and sync operations. The default value is 100. -1 means no upper limit. # and sync operations. The default value is 100. -1 means no upper limit.
# #

View File

@ -455,6 +455,9 @@ def start(config_options):
synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts
if config.server.gc_seconds:
synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
hs = GenericWorkerServer( hs = GenericWorkerServer(
config.server_name, config.server_name,
config=config, config=config,

View File

@ -342,6 +342,9 @@ def setup(config_options):
events.USE_FROZEN_DICTS = config.use_frozen_dicts events.USE_FROZEN_DICTS = config.use_frozen_dicts
if config.server.gc_seconds:
synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds
hs = SynapseHomeServer( hs = SynapseHomeServer(
config.server_name, config.server_name,
config=config, config=config,

View File

@ -19,7 +19,7 @@ import logging
import os.path import os.path
import re import re
from textwrap import indent from textwrap import indent
from typing import Any, Dict, Iterable, List, Optional, Set from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
import attr import attr
import yaml import yaml
@ -572,6 +572,7 @@ class ServerConfig(Config):
_warn_if_webclient_configured(self.listeners) _warn_if_webclient_configured(self.listeners)
self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None)) self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None))
self.gc_seconds = self.read_gc_intervals(config.get("gc_min_interval", None))
@attr.s @attr.s
class LimitRemoteRoomsConfig: class LimitRemoteRoomsConfig:
@ -917,6 +918,16 @@ class ServerConfig(Config):
# #
#gc_thresholds: [700, 10, 10] #gc_thresholds: [700, 10, 10]
# The minimum time in seconds between each GC for a generation, regardless of
# the GC thresholds. This ensures that we don't do GC too frequently.
#
# A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive
# generation 0 GCs, etc.
#
# Defaults to `[1s, 10s, 30s]`.
#
#gc_min_interval: [0.5s, 30s, 1m]
# Set the limit on the returned events in the timeline in the get # Set the limit on the returned events in the timeline in the get
# and sync operations. The default value is 100. -1 means no upper limit. # and sync operations. The default value is 100. -1 means no upper limit.
# #
@ -1305,6 +1316,24 @@ class ServerConfig(Config):
help="Turn on the twisted telnet manhole service on the given port.", help="Turn on the twisted telnet manhole service on the given port.",
) )
def read_gc_intervals(self, durations) -> Optional[Tuple[float, float, float]]:
    """Read the three durations of the `gc_min_interval` option, in seconds.

    Args:
        durations: The raw config value. Either None (option not set) or a
            list/tuple of exactly three values accepted by
            `self.parse_duration`.

    Returns:
        None if the option is not set, otherwise a 3-tuple with one minimum
        interval per GC generation, in seconds.

    Raises:
        ConfigError: If the value is not a list of exactly three parseable
            durations.
    """
    if durations is None:
        return None

    # A bare string also has a length and supports indexing, so a value such
    # as "123" would otherwise be read character by character as three
    # separate durations. Insist on a real sequence of three entries.
    if not isinstance(durations, (list, tuple)) or len(durations) != 3:
        raise ConfigError(
            "Value of `gc_min_interval` must be a list of three durations if set"
        )

    try:
        # The parsed value is divided by 1000 to get seconds (presumably
        # `parse_duration` returns milliseconds; confirm against its
        # definition).
        return (
            self.parse_duration(durations[0]) / 1000,
            self.parse_duration(durations[1]) / 1000,
            self.parse_duration(durations[2]) / 1000,
        )
    except Exception as e:
        # Chain the original error so the underlying parse failure is still
        # visible in the traceback.
        raise ConfigError(
            "Value of `gc_min_interval` must be a list of three durations if set"
        ) from e
def is_threepid_reserved(reserved_threepids, threepid): def is_threepid_reserved(reserved_threepids, threepid):
"""Check the threepid against the reserved threepid config """Check the threepid against the reserved threepid config

View File

@ -535,6 +535,13 @@ class ReactorLastSeenMetric:
REGISTRY.register(ReactorLastSeenMetric()) REGISTRY.register(ReactorLastSeenMetric())
# The minimum time in seconds between GCs for each generation, regardless of the current GC
# thresholds and counts. Indexed by generation number (0, 1, 2). The app startup code
# replaces this tuple with the `gc_min_interval` config value when one is set.
MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0)
# The time (in seconds since the epoch) of the last time we did a GC for each generation.
# Indexed by generation number and updated in place after each manual collection.
_last_gc = [0.0, 0.0, 0.0]
def runUntilCurrentTimer(reactor, func): def runUntilCurrentTimer(reactor, func):
@functools.wraps(func) @functools.wraps(func)
@ -575,11 +582,16 @@ def runUntilCurrentTimer(reactor, func):
return ret return ret
# Check if we need to do a manual GC (since its been disabled), and do # Check if we need to do a manual GC (since its been disabled), and do
# one if necessary. # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may
# promote an object into gen 2, and we don't want to handle the same
# object multiple times.
threshold = gc.get_threshold() threshold = gc.get_threshold()
counts = gc.get_count() counts = gc.get_count()
for i in (2, 1, 0): for i in (2, 1, 0):
if threshold[i] < counts[i]: # We check if we need to do one based on a straightforward
# comparison between the threshold and count. We also do an extra
# check to make sure that we don't do a GC too often.
if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]:
if i == 0: if i == 0:
logger.debug("Collecting gc %d", i) logger.debug("Collecting gc %d", i)
else: else:
@ -589,6 +601,8 @@ def runUntilCurrentTimer(reactor, func):
unreachable = gc.collect(i) unreachable = gc.collect(i)
end = time.time() end = time.time()
_last_gc[i] = end
gc_time.labels(i).observe(end - start) gc_time.labels(i).observe(end - start)
gc_unreachable.labels(i).set(unreachable) gc_unreachable.labels(i).set(unreachable)