Fix races in room stats (and other) updates. (#6187)
Hopefully this will fix the occasional failures we were seeing in the room directory. The problem was that events are not necessarily persisted (and `current_state_delta_stream` updated) in the same order as their stream_id. So for instance current_state_delta 9 might be persisted *before* current_state_delta 8. Then, when the room stats saw stream_id 9, it assumed it had done everything up to 9, and never came back to do stream_id 8. We can solve this easily by only processing up to the stream_id where we know all events have been persisted.
This commit is contained in:
parent
562b4e51dd
commit
a139420a3c
|
@ -0,0 +1 @@
|
||||||
|
Fix occasional missed updates in the room and user directories.
|
|
@ -803,17 +803,25 @@ class PresenceHandler(object):
|
||||||
# Loop round handling deltas until we're up to date
|
# Loop round handling deltas until we're up to date
|
||||||
while True:
|
while True:
|
||||||
with Measure(self.clock, "presence_delta"):
|
with Measure(self.clock, "presence_delta"):
|
||||||
deltas = yield self.store.get_current_state_deltas(self._event_pos)
|
room_max_stream_ordering = self.store.get_room_max_stream_ordering()
|
||||||
if not deltas:
|
if self._event_pos == room_max_stream_ordering:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Processing presence stats %s->%s",
|
||||||
|
self._event_pos,
|
||||||
|
room_max_stream_ordering,
|
||||||
|
)
|
||||||
|
max_pos, deltas = yield self.store.get_current_state_deltas(
|
||||||
|
self._event_pos, room_max_stream_ordering
|
||||||
|
)
|
||||||
yield self._handle_state_delta(deltas)
|
yield self._handle_state_delta(deltas)
|
||||||
|
|
||||||
self._event_pos = deltas[-1]["stream_id"]
|
self._event_pos = max_pos
|
||||||
|
|
||||||
# Expose current event processing position to prometheus
|
# Expose current event processing position to prometheus
|
||||||
synapse.metrics.event_processing_positions.labels("presence").set(
|
synapse.metrics.event_processing_positions.labels("presence").set(
|
||||||
self._event_pos
|
max_pos
|
||||||
)
|
)
|
||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
|
|
|
@ -87,21 +87,23 @@ class StatsHandler(StateDeltasHandler):
|
||||||
# Be sure to read the max stream_ordering *before* checking if there are any outstanding
|
# Be sure to read the max stream_ordering *before* checking if there are any outstanding
|
||||||
# deltas, since there is otherwise a chance that we could miss updates which arrive
|
# deltas, since there is otherwise a chance that we could miss updates which arrive
|
||||||
# after we check the deltas.
|
# after we check the deltas.
|
||||||
room_max_stream_ordering = yield self.store.get_room_max_stream_ordering()
|
room_max_stream_ordering = self.store.get_room_max_stream_ordering()
|
||||||
if self.pos == room_max_stream_ordering:
|
if self.pos == room_max_stream_ordering:
|
||||||
break
|
break
|
||||||
|
|
||||||
deltas = yield self.store.get_current_state_deltas(self.pos)
|
logger.debug(
|
||||||
|
"Processing room stats %s->%s", self.pos, room_max_stream_ordering
|
||||||
|
)
|
||||||
|
max_pos, deltas = yield self.store.get_current_state_deltas(
|
||||||
|
self.pos, room_max_stream_ordering
|
||||||
|
)
|
||||||
|
|
||||||
if deltas:
|
if deltas:
|
||||||
logger.debug("Handling %d state deltas", len(deltas))
|
logger.debug("Handling %d state deltas", len(deltas))
|
||||||
room_deltas, user_deltas = yield self._handle_deltas(deltas)
|
room_deltas, user_deltas = yield self._handle_deltas(deltas)
|
||||||
|
|
||||||
max_pos = deltas[-1]["stream_id"]
|
|
||||||
else:
|
else:
|
||||||
room_deltas = {}
|
room_deltas = {}
|
||||||
user_deltas = {}
|
user_deltas = {}
|
||||||
max_pos = room_max_stream_ordering
|
|
||||||
|
|
||||||
# Then count deltas for total_events and total_event_bytes.
|
# Then count deltas for total_events and total_event_bytes.
|
||||||
room_count, user_count = yield self.store.get_changes_room_total_events_and_bytes(
|
room_count, user_count = yield self.store.get_changes_room_total_events_and_bytes(
|
||||||
|
|
|
@ -138,21 +138,28 @@ class UserDirectoryHandler(StateDeltasHandler):
|
||||||
# Loop round handling deltas until we're up to date
|
# Loop round handling deltas until we're up to date
|
||||||
while True:
|
while True:
|
||||||
with Measure(self.clock, "user_dir_delta"):
|
with Measure(self.clock, "user_dir_delta"):
|
||||||
deltas = yield self.store.get_current_state_deltas(self.pos)
|
room_max_stream_ordering = self.store.get_room_max_stream_ordering()
|
||||||
if not deltas:
|
if self.pos == room_max_stream_ordering:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Processing user stats %s->%s", self.pos, room_max_stream_ordering
|
||||||
|
)
|
||||||
|
max_pos, deltas = yield self.store.get_current_state_deltas(
|
||||||
|
self.pos, room_max_stream_ordering
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("Handling %d state deltas", len(deltas))
|
logger.info("Handling %d state deltas", len(deltas))
|
||||||
yield self._handle_deltas(deltas)
|
yield self._handle_deltas(deltas)
|
||||||
|
|
||||||
self.pos = deltas[-1]["stream_id"]
|
self.pos = max_pos
|
||||||
|
|
||||||
# Expose current event processing position to prometheus
|
# Expose current event processing position to prometheus
|
||||||
synapse.metrics.event_processing_positions.labels("user_dir").set(
|
synapse.metrics.event_processing_positions.labels("user_dir").set(
|
||||||
self.pos
|
max_pos
|
||||||
)
|
)
|
||||||
|
|
||||||
yield self.store.update_user_directory_stream_pos(self.pos)
|
yield self.store.update_user_directory_stream_pos(max_pos)
|
||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def _handle_deltas(self, deltas):
|
def _handle_deltas(self, deltas):
|
||||||
|
|
|
@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class StateDeltasStore(SQLBaseStore):
|
class StateDeltasStore(SQLBaseStore):
|
||||||
def get_current_state_deltas(self, prev_stream_id):
|
def get_current_state_deltas(self, prev_stream_id: int, max_stream_id: int):
|
||||||
"""Fetch a list of room state changes since the given stream id
|
"""Fetch a list of room state changes since the given stream id
|
||||||
|
|
||||||
Each entry in the result contains the following fields:
|
Each entry in the result contains the following fields:
|
||||||
|
@ -36,15 +36,27 @@ class StateDeltasStore(SQLBaseStore):
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
prev_stream_id (int): point to get changes since (exclusive)
|
prev_stream_id (int): point to get changes since (exclusive)
|
||||||
|
max_stream_id (int): the point that we know has been correctly persisted
|
||||||
|
- ie, an upper limit to return changes from.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Deferred[list[dict]]: results
|
Deferred[tuple[int, list[dict]]: A tuple consisting of:
|
||||||
|
- the stream id which these results go up to
|
||||||
|
- list of current_state_delta_stream rows. If it is empty, we are
|
||||||
|
up to date.
|
||||||
"""
|
"""
|
||||||
prev_stream_id = int(prev_stream_id)
|
prev_stream_id = int(prev_stream_id)
|
||||||
|
|
||||||
|
# check we're not going backwards
|
||||||
|
assert prev_stream_id <= max_stream_id
|
||||||
|
|
||||||
if not self._curr_state_delta_stream_cache.has_any_entity_changed(
|
if not self._curr_state_delta_stream_cache.has_any_entity_changed(
|
||||||
prev_stream_id
|
prev_stream_id
|
||||||
):
|
):
|
||||||
return []
|
# if the CSDs haven't changed between prev_stream_id and now, we
|
||||||
|
# know for certain that they haven't changed between prev_stream_id and
|
||||||
|
# max_stream_id.
|
||||||
|
return max_stream_id, []
|
||||||
|
|
||||||
def get_current_state_deltas_txn(txn):
|
def get_current_state_deltas_txn(txn):
|
||||||
# First we calculate the max stream id that will give us less than
|
# First we calculate the max stream id that will give us less than
|
||||||
|
@ -54,21 +66,29 @@ class StateDeltasStore(SQLBaseStore):
|
||||||
sql = """
|
sql = """
|
||||||
SELECT stream_id, count(*)
|
SELECT stream_id, count(*)
|
||||||
FROM current_state_delta_stream
|
FROM current_state_delta_stream
|
||||||
WHERE stream_id > ?
|
WHERE stream_id > ? AND stream_id <= ?
|
||||||
GROUP BY stream_id
|
GROUP BY stream_id
|
||||||
ORDER BY stream_id ASC
|
ORDER BY stream_id ASC
|
||||||
LIMIT 100
|
LIMIT 100
|
||||||
"""
|
"""
|
||||||
txn.execute(sql, (prev_stream_id,))
|
txn.execute(sql, (prev_stream_id, max_stream_id))
|
||||||
|
|
||||||
total = 0
|
total = 0
|
||||||
max_stream_id = prev_stream_id
|
|
||||||
for max_stream_id, count in txn:
|
for stream_id, count in txn:
|
||||||
total += count
|
total += count
|
||||||
if total > 100:
|
if total > 100:
|
||||||
# We arbitrarily limit to 100 entries to ensure we don't
|
# We arbitrarily limit to 100 entries to ensure we don't
|
||||||
# select toooo many.
|
# select toooo many.
|
||||||
|
logger.debug(
|
||||||
|
"Clipping current_state_delta_stream rows to stream_id %i",
|
||||||
|
stream_id,
|
||||||
|
)
|
||||||
|
clipped_stream_id = stream_id
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
# if there's no problem, we may as well go right up to the max_stream_id
|
||||||
|
clipped_stream_id = max_stream_id
|
||||||
|
|
||||||
# Now actually get the deltas
|
# Now actually get the deltas
|
||||||
sql = """
|
sql = """
|
||||||
|
@ -77,8 +97,8 @@ class StateDeltasStore(SQLBaseStore):
|
||||||
WHERE ? < stream_id AND stream_id <= ?
|
WHERE ? < stream_id AND stream_id <= ?
|
||||||
ORDER BY stream_id ASC
|
ORDER BY stream_id ASC
|
||||||
"""
|
"""
|
||||||
txn.execute(sql, (prev_stream_id, max_stream_id))
|
txn.execute(sql, (prev_stream_id, clipped_stream_id))
|
||||||
return self.cursor_to_dict(txn)
|
return clipped_stream_id, self.cursor_to_dict(txn)
|
||||||
|
|
||||||
return self.runInteraction(
|
return self.runInteraction(
|
||||||
"get_current_state_deltas", get_current_state_deltas_txn
|
"get_current_state_deltas", get_current_state_deltas_txn
|
||||||
|
|
|
@ -139,7 +139,7 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
|
||||||
defer.succeed(1)
|
defer.succeed(1)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.datastore.get_current_state_deltas.return_value = None
|
self.datastore.get_current_state_deltas.return_value = (0, None)
|
||||||
|
|
||||||
self.datastore.get_to_device_stream_token = lambda: 0
|
self.datastore.get_to_device_stream_token = lambda: 0
|
||||||
self.datastore.get_new_device_msgs_for_remote = lambda *args, **kargs: ([], 0)
|
self.datastore.get_new_device_msgs_for_remote = lambda *args, **kargs: ([], 0)
|
||||||
|
|
|
@ -62,7 +62,7 @@ class UserRegisterTestCase(unittest.HomeserverTestCase):
|
||||||
self.device_handler.check_device_registered = Mock(return_value="FAKE")
|
self.device_handler.check_device_registered = Mock(return_value="FAKE")
|
||||||
|
|
||||||
self.datastore = Mock(return_value=Mock())
|
self.datastore = Mock(return_value=Mock())
|
||||||
self.datastore.get_current_state_deltas = Mock(return_value=[])
|
self.datastore.get_current_state_deltas = Mock(return_value=(0, []))
|
||||||
|
|
||||||
self.secrets = Mock()
|
self.secrets = Mock()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue