diff --git a/changelog.d/16174.bugfix b/changelog.d/16174.bugfix new file mode 100644 index 0000000000..83649cf42a --- /dev/null +++ b/changelog.d/16174.bugfix @@ -0,0 +1 @@ +Fix a long-standing bug where multi-device accounts could cause high load due to presence. diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index 80190838b7..a4b05b72e7 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -155,6 +155,8 @@ LAST_ACTIVE_GRANULARITY = 60 * 1000 # How long to wait until a new /events or /sync request before assuming # the client has gone. SYNC_ONLINE_TIMEOUT = 30 * 1000 +# Busy status waits longer, but does eventually go offline. +BUSY_ONLINE_TIMEOUT = 60 * 60 * 1000 # How long to wait before marking the user as idle. Compared against last active IDLE_TIMER = 5 * 60 * 1000 @@ -2066,7 +2068,15 @@ def handle_timeout( device_state.last_sync_ts, device_state.last_active_ts ) - if now - sync_or_active > SYNC_ONLINE_TIMEOUT: + # Implementations aren't meant to timeout a device with a busy + # state, but it needs to timeout *eventually* or else the user + # will be stuck in that state. + online_timeout = ( + BUSY_ONLINE_TIMEOUT + if device_state.state == PresenceState.BUSY + else SYNC_ONLINE_TIMEOUT + ) + if now - sync_or_active > online_timeout: # Mark the device as going offline. offline_devices.append(device_id) device_changed = True @@ -2166,6 +2176,13 @@ def handle_update( new_state = new_state.copy_and_replace(last_federation_update_ts=now) federation_ping = True + if new_state.state == PresenceState.BUSY: + wheel_timer.insert( + now=now, + obj=user_id, + then=new_state.last_user_sync_ts + BUSY_ONLINE_TIMEOUT, + ) + else: wheel_timer.insert( now=now, diff --git a/tests/handlers/test_presence.py b/tests/handlers/test_presence.py index 914415740a..638787b029 100644 --- a/tests/handlers/test_presence.py +++ b/tests/handlers/test_presence.py @@ -26,6 +26,7 @@ from synapse.api.room_versions import KNOWN_ROOM_VERSIONS from synapse.events.builder import EventBuilder from synapse.federation.sender import FederationSender from synapse.handlers.presence import ( + BUSY_ONLINE_TIMEOUT, EXTERNAL_PROCESS_EXPIRY, FEDERATION_PING_INTERVAL, FEDERATION_TIMEOUT, @@ -912,6 +913,13 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): for cases in [ # If both devices have the same state, online should eventually idle. # Otherwise, the state doesn't change. + ( + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), ( PresenceState.ONLINE, PresenceState.ONLINE, @@ -933,7 +941,29 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): PresenceState.OFFLINE, PresenceState.OFFLINE, ), - # If the second device has a "lower" state it should fallback to it. + # If the second device has a "lower" state it should fallback to it, + # except for "busy" which overrides. + ( + PresenceState.BUSY, + PresenceState.ONLINE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.BUSY, + PresenceState.UNAVAILABLE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.BUSY, + PresenceState.OFFLINE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), ( PresenceState.ONLINE, PresenceState.UNAVAILABLE, @@ -956,6 +986,27 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): PresenceState.UNAVAILABLE, ), # If the second device has a "higher" state it should override. + ( + PresenceState.ONLINE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.UNAVAILABLE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.OFFLINE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), ( PresenceState.UNAVAILABLE, PresenceState.ONLINE, @@ -1114,6 +1165,12 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): for workers in (False, True) for cases in [ # If both devices have the same state, nothing exciting should happen. + ( + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), ( PresenceState.ONLINE, PresenceState.ONLINE, @@ -1132,7 +1189,26 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): PresenceState.OFFLINE, PresenceState.OFFLINE, ), - # If the second device has a "lower" state it should fallback to it. + # If the second device has a "lower" state it should fallback to it, + # except for "busy" which overrides. + ( + PresenceState.BUSY, + PresenceState.ONLINE, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.BUSY, + PresenceState.UNAVAILABLE, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.BUSY, + PresenceState.OFFLINE, + PresenceState.BUSY, + PresenceState.BUSY, + ), ( PresenceState.ONLINE, PresenceState.UNAVAILABLE, @@ -1152,6 +1228,24 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): PresenceState.OFFLINE, ), # If the second device has a "higher" state it should override. + ( + PresenceState.ONLINE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.UNAVAILABLE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), + ( + PresenceState.OFFLINE, + PresenceState.BUSY, + PresenceState.BUSY, + PresenceState.BUSY, + ), ( PresenceState.UNAVAILABLE, PresenceState.ONLINE, @@ -1266,7 +1360,11 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase): # 8. Advance such that the second device should be discarded (the sync timeout), # then pump so _handle_timeouts function to called. - self.reactor.advance(SYNC_ONLINE_TIMEOUT / 1000) + if dev_1_state == PresenceState.BUSY or dev_2_state == PresenceState.BUSY: + timeout = BUSY_ONLINE_TIMEOUT + else: + timeout = SYNC_ONLINE_TIMEOUT + self.reactor.advance(timeout / 1000) self.reactor.pump([5]) # 9. There are no more devices, should be offline.