diff --git a/changelog.d/9902.feature b/changelog.d/9902.feature new file mode 100644 index 0000000000..4d9f324d4e --- /dev/null +++ b/changelog.d/9902.feature @@ -0,0 +1 @@ +Add limits to how often Synapse will GC, ensuring that large servers do not end up GC thrashing if `gc_thresholds` has not been correctly set. diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index d013725cdc..f469d6e54f 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -152,6 +152,16 @@ presence: # #gc_thresholds: [700, 10, 10] +# The minimum time in seconds between each GC for a generation, regardless of +# the GC thresholds. This ensures that we don't do GC too frequently. +# +# A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive +# generation 0 GCs, etc. +# +# Defaults to `[1s, 10s, 30s]`. +# +#gc_min_interval: [0.5s, 30s, 1m] + # Set the limit on the returned events in the timeline in the get # and sync operations. The default value is 100. -1 means no upper limit. # diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py index 1a15ceee81..a3fe9a3f38 100644 --- a/synapse/app/generic_worker.py +++ b/synapse/app/generic_worker.py @@ -455,6 +455,9 @@ def start(config_options): synapse.events.USE_FROZEN_DICTS = config.use_frozen_dicts + if config.server.gc_seconds: + synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds + hs = GenericWorkerServer( config.server_name, config=config, diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index 8e78134bbe..6a823da10d 100644 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -342,6 +342,9 @@ def setup(config_options): events.USE_FROZEN_DICTS = config.use_frozen_dicts + if config.server.gc_seconds: + synapse.metrics.MIN_TIME_BETWEEN_GCS = config.server.gc_seconds + hs = SynapseHomeServer( config.server_name, config=config, diff --git a/synapse/config/server.py b/synapse/config/server.py index 21ca7b33e3..c290a35a92 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -19,7 +19,7 @@ import logging import os.path import re from textwrap import indent -from typing import Any, Dict, Iterable, List, Optional, Set +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple import attr import yaml @@ -572,6 +572,7 @@ class ServerConfig(Config): _warn_if_webclient_configured(self.listeners) self.gc_thresholds = read_gc_thresholds(config.get("gc_thresholds", None)) + self.gc_seconds = self.read_gc_intervals(config.get("gc_min_interval", None)) @attr.s class LimitRemoteRoomsConfig: @@ -917,6 +918,16 @@ class ServerConfig(Config): # #gc_thresholds: [700, 10, 10] + # The minimum time in seconds between each GC for a generation, regardless of + # the GC thresholds. This ensures that we don't do GC too frequently. + # + # A value of `[1s, 10s, 30s]` indicates that a second must pass between consecutive + # generation 0 GCs, etc. + # + # Defaults to `[1s, 10s, 30s]`. + # + #gc_min_interval: [0.5s, 30s, 1m] + # Set the limit on the returned events in the timeline in the get # and sync operations. The default value is 100. -1 means no upper limit. # @@ -1305,6 +1316,24 @@ class ServerConfig(Config): help="Turn on the twisted telnet manhole service on the given port.", ) + def read_gc_intervals(self, durations) -> Optional[Tuple[float, float, float]]: + """Reads the three durations for the GC min interval option, returning seconds.""" + if durations is None: + return None + + try: + if len(durations) != 3: + raise ValueError() + return ( + self.parse_duration(durations[0]) / 1000, + self.parse_duration(durations[1]) / 1000, + self.parse_duration(durations[2]) / 1000, + ) + except Exception: + raise ConfigError( + "Value of `gc_min_interval` must be a list of three durations if set" + ) + def is_threepid_reserved(reserved_threepids, threepid): """Check the threepid against the reserved threepid config diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 31b7b3c256..e671da26d5 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -535,6 +535,13 @@ class ReactorLastSeenMetric: REGISTRY.register(ReactorLastSeenMetric()) +# The minimum time in seconds between GCs for each generation, regardless of the current GC +# thresholds and counts. +MIN_TIME_BETWEEN_GCS = (1.0, 10.0, 30.0) + +# The time (in seconds since the epoch) of the last time we did a GC for each generation. +_last_gc = [0.0, 0.0, 0.0] + def runUntilCurrentTimer(reactor, func): @functools.wraps(func) @@ -575,11 +582,16 @@ def runUntilCurrentTimer(reactor, func): return ret # Check if we need to do a manual GC (since its been disabled), and do - # one if necessary. + # one if necessary. Note we go in reverse order as e.g. a gen 1 GC may + # promote an object into gen 2, and we don't want to handle the same + # object multiple times. threshold = gc.get_threshold() counts = gc.get_count() for i in (2, 1, 0): - if threshold[i] < counts[i]: + # We check if we need to do one based on a straightforward + # comparison between the threshold and count. We also do an extra + # check to make sure that we don't a GC too often. + if threshold[i] < counts[i] and MIN_TIME_BETWEEN_GCS[i] < end - _last_gc[i]: if i == 0: logger.debug("Collecting gc %d", i) else: @@ -589,6 +601,8 @@ def runUntilCurrentTimer(reactor, func): unreachable = gc.collect(i) end = time.time() + _last_gc[i] = end + gc_time.labels(i).observe(end - start) gc_unreachable.labels(i).set(unreachable)