Time out busy presence status & test multi-device busy (#16174)

Add a (long) timeout to when a "busy" device is considered not online.
This does *not* match MSC3026, but is a reasonable thing for an
implementation to do.

Expands tests for the (unstable) busy presence with multiple devices.
This commit is contained in:
Patrick Cloke 2023-09-05 10:39:38 -04:00 committed by GitHub
parent ea75346f6a
commit 8b5013dcbc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 120 additions and 4 deletions

1
changelog.d/16174.bugfix Normal file
View File

@ -0,0 +1 @@
Fix a long-standing bug where multi-device accounts could cause high load due to presence.

View File

@ -155,6 +155,8 @@ LAST_ACTIVE_GRANULARITY = 60 * 1000
# How long to wait until a new /events or /sync request before assuming
# the client has gone.
SYNC_ONLINE_TIMEOUT = 30 * 1000
# Busy status waits longer, but does eventually go offline.
BUSY_ONLINE_TIMEOUT = 60 * 60 * 1000
# How long to wait before marking the user as idle. Compared against last active
IDLE_TIMER = 5 * 60 * 1000
@ -2066,7 +2068,15 @@ def handle_timeout(
device_state.last_sync_ts, device_state.last_active_ts
)
if now - sync_or_active > SYNC_ONLINE_TIMEOUT:
# Implementations aren't meant to timeout a device with a busy
# state, but it needs to timeout *eventually* or else the user
# will be stuck in that state.
online_timeout = (
BUSY_ONLINE_TIMEOUT
if device_state.state == PresenceState.BUSY
else SYNC_ONLINE_TIMEOUT
)
if now - sync_or_active > online_timeout:
# Mark the device as going offline.
offline_devices.append(device_id)
device_changed = True
@ -2166,6 +2176,13 @@ def handle_update(
new_state = new_state.copy_and_replace(last_federation_update_ts=now)
federation_ping = True
if new_state.state == PresenceState.BUSY:
wheel_timer.insert(
now=now,
obj=user_id,
then=new_state.last_user_sync_ts + BUSY_ONLINE_TIMEOUT,
)
else:
wheel_timer.insert(
now=now,

View File

@ -26,6 +26,7 @@ from synapse.api.room_versions import KNOWN_ROOM_VERSIONS
from synapse.events.builder import EventBuilder
from synapse.federation.sender import FederationSender
from synapse.handlers.presence import (
BUSY_ONLINE_TIMEOUT,
EXTERNAL_PROCESS_EXPIRY,
FEDERATION_PING_INTERVAL,
FEDERATION_TIMEOUT,
@ -912,6 +913,13 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
for cases in [
# If both devices have the same state, online should eventually idle.
# Otherwise, the state doesn't change.
(
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.ONLINE,
PresenceState.ONLINE,
@ -933,7 +941,29 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
PresenceState.OFFLINE,
PresenceState.OFFLINE,
),
# If the second device has a "lower" state it should fallback to it.
# If the second device has a "lower" state it should fallback to it,
# except for "busy" which overrides.
(
PresenceState.BUSY,
PresenceState.ONLINE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.BUSY,
PresenceState.UNAVAILABLE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.BUSY,
PresenceState.OFFLINE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.ONLINE,
PresenceState.UNAVAILABLE,
@ -956,6 +986,27 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
PresenceState.UNAVAILABLE,
),
# If the second device has a "higher" state it should override.
(
PresenceState.ONLINE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.UNAVAILABLE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.OFFLINE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.UNAVAILABLE,
PresenceState.ONLINE,
@ -1114,6 +1165,12 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
for workers in (False, True)
for cases in [
# If both devices have the same state, nothing exciting should happen.
(
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.ONLINE,
PresenceState.ONLINE,
@ -1132,7 +1189,26 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
PresenceState.OFFLINE,
PresenceState.OFFLINE,
),
# If the second device has a "lower" state it should fallback to it.
# If the second device has a "lower" state it should fallback to it,
# except for "busy" which overrides.
(
PresenceState.BUSY,
PresenceState.ONLINE,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.BUSY,
PresenceState.UNAVAILABLE,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.BUSY,
PresenceState.OFFLINE,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.ONLINE,
PresenceState.UNAVAILABLE,
@ -1152,6 +1228,24 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
PresenceState.OFFLINE,
),
# If the second device has a "higher" state it should override.
(
PresenceState.ONLINE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.UNAVAILABLE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.OFFLINE,
PresenceState.BUSY,
PresenceState.BUSY,
PresenceState.BUSY,
),
(
PresenceState.UNAVAILABLE,
PresenceState.ONLINE,
@ -1266,7 +1360,11 @@ class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
# 8. Advance such that the second device should be discarded (the sync timeout),
# then pump so _handle_timeouts function to called.
self.reactor.advance(SYNC_ONLINE_TIMEOUT / 1000)
if dev_1_state == PresenceState.BUSY or dev_2_state == PresenceState.BUSY:
timeout = BUSY_ONLINE_TIMEOUT
else:
timeout = SYNC_ONLINE_TIMEOUT
self.reactor.advance(timeout / 1000)
self.reactor.pump([5])
# 9. There are no more devices, should be offline.