Batch process handling state groups
This commit is contained in:
parent
67a1e315cc
commit
47a9da28ca
|
@ -37,6 +37,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process
|
|||
from synapse.storage.background_updates import BackgroundUpdateStore
|
||||
from synapse.storage.event_federation import EventFederationStore
|
||||
from synapse.storage.events_worker import EventsWorkerStore
|
||||
from synapse.storage.state import StateGroupWorkerStore
|
||||
from synapse.types import RoomStreamToken, get_domain_from_id
|
||||
from synapse.util.async_helpers import ObservableDeferred
|
||||
from synapse.util.caches.descriptors import cached, cachedInlineCallbacks
|
||||
|
@ -203,7 +204,8 @@ def _retry_on_integrity_error(func):
|
|||
|
||||
# inherits from EventFederationStore so that we can call _update_backward_extremities
|
||||
# and _handle_mult_prev_events (though arguably those could both be moved in here)
|
||||
class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore):
|
||||
class EventsStore(StateGroupWorkerStore, EventFederationStore, EventsWorkerStore,
|
||||
BackgroundUpdateStore):
|
||||
EVENT_ORIGIN_SERVER_TS_NAME = "event_origin_server_ts"
|
||||
EVENT_FIELDS_SENDER_URL_UPDATE_NAME = "event_fields_sender_url"
|
||||
|
||||
|
@ -1995,70 +1997,29 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore
|
|||
|
||||
logger.info("[purge] finding redundant state groups")
|
||||
|
||||
# Get all state groups that are only referenced by events that are
|
||||
# to be deleted.
|
||||
# This works by first getting state groups that we may want to delete,
|
||||
# joining against event_to_state_groups to get events that use that
|
||||
# state group, then left joining against events_to_purge again. Any
|
||||
# state group where the left join produce *no nulls* are referenced
|
||||
# only by events that are going to be purged.
|
||||
# Get all state groups that are referenced by events that are to be
|
||||
# deleted. We then go and check if they are referenced by other events
|
||||
# or state groups, and if not we delete them.
|
||||
txn.execute("""
|
||||
SELECT state_group FROM
|
||||
(
|
||||
SELECT DISTINCT state_group FROM events_to_purge
|
||||
INNER JOIN event_to_state_groups USING (event_id)
|
||||
) AS sp
|
||||
INNER JOIN event_to_state_groups USING (state_group)
|
||||
LEFT JOIN events_to_purge AS ep USING (event_id)
|
||||
GROUP BY state_group
|
||||
HAVING SUM(CASE WHEN ep.event_id IS NULL THEN 1 ELSE 0 END) = 0
|
||||
""")
|
||||
|
||||
state_rows = txn.fetchall()
|
||||
logger.info("[purge] found %i redundant state groups", len(state_rows))
|
||||
|
||||
# make a set of the redundant state groups, so that we can look them up
|
||||
# efficiently
|
||||
state_groups_to_delete = set([sg for sg, in state_rows])
|
||||
|
||||
# Now we get all the state groups that rely on these state groups
|
||||
logger.info("[purge] finding state groups which depend on redundant"
|
||||
" state groups")
|
||||
remaining_state_groups = []
|
||||
unreferenced_state_groups = 0
|
||||
for i in range(0, len(state_rows), 100):
|
||||
chunk = [sg for sg, in state_rows[i:i + 100]]
|
||||
# look for state groups whose prev_state_group is one we are about
|
||||
# to delete
|
||||
rows = self._simple_select_many_txn(
|
||||
txn,
|
||||
table="state_group_edges",
|
||||
column="prev_state_group",
|
||||
iterable=chunk,
|
||||
retcols=["state_group"],
|
||||
keyvalues={},
|
||||
referenced_state_groups = set(sg for sg, in txn)
|
||||
logger.info(
|
||||
"[purge] found %i referenced state groups",
|
||||
len(referenced_state_groups),
|
||||
)
|
||||
|
||||
for row in rows:
|
||||
sg = row["state_group"]
|
||||
logger.info("[purge] finding state groups that can be deleted")
|
||||
|
||||
if sg in state_groups_to_delete:
|
||||
# exclude state groups we are about to delete: no point in
|
||||
# updating them
|
||||
continue
|
||||
|
||||
if not self._is_state_group_referenced(txn, sg):
|
||||
# Let's also delete unreferenced state groups while we're
|
||||
# here, since otherwise we'd need to de-delta them
|
||||
state_groups_to_delete.add(sg)
|
||||
unreferenced_state_groups += 1
|
||||
continue
|
||||
|
||||
remaining_state_groups.append(sg)
|
||||
state_groups_to_delete, remaining_state_groups = self._find_unreferenced_groups(
|
||||
txn, referenced_state_groups,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"[purge] found %i extra unreferenced state groups to delete",
|
||||
unreferenced_state_groups,
|
||||
"[purge] found %i state groups to delete",
|
||||
len(state_groups_to_delete),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
|
@ -2109,11 +2070,11 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore
|
|||
logger.info("[purge] removing redundant state groups")
|
||||
txn.executemany(
|
||||
"DELETE FROM state_groups_state WHERE state_group = ?",
|
||||
state_rows
|
||||
((sg,) for sg in state_groups_to_delete),
|
||||
)
|
||||
txn.executemany(
|
||||
"DELETE FROM state_groups WHERE id = ?",
|
||||
state_rows
|
||||
((sg,) for sg in state_groups_to_delete),
|
||||
)
|
||||
|
||||
logger.info("[purge] removing events from event_to_state_groups")
|
||||
|
|
|
@ -1041,55 +1041,85 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
|
|||
|
||||
return count
|
||||
|
||||
def _is_state_group_referenced(self, txn, state_group):
|
||||
"""Checks if a given state group is referenced, or is safe to delete.
|
||||
def _find_unreferenced_groups(self, txn, state_groups):
|
||||
"""Used when purging history to figure out which state groups can be
|
||||
deleted and which need to be de-delta'ed (due to one of its prev groups
|
||||
being scheduled for deletion).
|
||||
|
||||
A state group is referenced if it or any of its descendants are
|
||||
pointed at by an event. (A descendant is a state_group whose chain of
|
||||
prev_groups includes the given state_group.)
|
||||
Args:
|
||||
txn
|
||||
state_groups (set[int]): Set of state groups referenced by events
|
||||
that are going to be deleted.
|
||||
|
||||
Returns:
|
||||
tuple[set[int], set[int]]: The set of state groups that can be
|
||||
deleted and the set of state groups that need to be de-delta'ed
|
||||
"""
|
||||
# Graph of state group -> previous group
|
||||
graph = {}
|
||||
|
||||
# We check this by doing a depth first search to look for any
|
||||
# descendant referenced by `event_to_state_groups`.
|
||||
# Set of events that we have found to be referenced by events
|
||||
referenced_groups = set()
|
||||
|
||||
# State groups we need to check, contains state groups that are
|
||||
# descendants of `state_group`
|
||||
state_groups_to_search = [state_group]
|
||||
# Set of state groups we've already seen
|
||||
state_groups_seen = set(state_groups)
|
||||
|
||||
# Set of state groups we've already checked
|
||||
state_groups_searched = set()
|
||||
# Set of state groups to handle next.
|
||||
next_to_search = set(state_groups)
|
||||
while next_to_search:
|
||||
# We bound size of groups we're looking up at once, to stop the
|
||||
# SQL query getting too big
|
||||
if len(next_to_search) < 100:
|
||||
current_search = next_to_search
|
||||
next_to_search = set()
|
||||
else:
|
||||
lst = list(next_to_search)
|
||||
current_search = set(lst[:100])
|
||||
next_to_search = set(lst[100:])
|
||||
|
||||
while state_groups_to_search:
|
||||
state_group = state_groups_to_search.pop() # Next state group to check
|
||||
# Check if state groups are referenced
|
||||
sql = """
|
||||
SELECT state_group, count(*) FROM event_to_state_groups
|
||||
LEFT JOIN events_to_purge AS ep USING (event_id)
|
||||
WHERE state_group IN (%s) AND ep.event_id IS NULL
|
||||
GROUP BY state_group
|
||||
""" % (",".join("?" for _ in current_search),)
|
||||
txn.execute(sql, list(current_search))
|
||||
|
||||
is_referenced = self._simple_select_one_onecol_txn(
|
||||
txn,
|
||||
table="event_to_state_groups",
|
||||
keyvalues={"state_group": state_group},
|
||||
retcol="event_id",
|
||||
allow_none=True,
|
||||
)
|
||||
if is_referenced:
|
||||
# A descendant is referenced by event_to_state_groups, so
|
||||
# original state group is referenced.
|
||||
return True
|
||||
referenced = set(sg for sg, cnt in txn if cnt > 0)
|
||||
referenced_groups |= referenced
|
||||
|
||||
state_groups_searched.add(state_group)
|
||||
# We don't continue iterating up the state group graphs for state
|
||||
# groups that are referenced.
|
||||
current_search -= referenced
|
||||
|
||||
# Find all children of current state group and add to search
|
||||
references = self._simple_select_onecol_txn(
|
||||
rows = self._simple_select_many_txn(
|
||||
txn,
|
||||
table="state_group_edges",
|
||||
keyvalues={"prev_state_group": state_group},
|
||||
retcol="state_group",
|
||||
column="prev_state_group",
|
||||
iterable=current_search,
|
||||
keyvalues={},
|
||||
retcols=("prev_state_group", "state_group",),
|
||||
)
|
||||
state_groups_to_search.extend(references)
|
||||
|
||||
# Lets be paranoid and check for cycles
|
||||
if state_groups_searched.intersection(references):
|
||||
raise Exception("State group %s has cyclic dependency", state_group)
|
||||
next_to_search.update(row["state_group"] for row in rows)
|
||||
# We don't bother re-handling groups we've already seen
|
||||
next_to_search -= state_groups_seen
|
||||
state_groups_seen |= next_to_search
|
||||
|
||||
return False
|
||||
for row in rows:
|
||||
# Note: Each state group can have at most one prev group
|
||||
graph[row["state_group"]] = row["prev_state_group"]
|
||||
|
||||
to_delete = state_groups_seen - referenced_groups
|
||||
|
||||
to_dedelta = set()
|
||||
for sg in referenced_groups:
|
||||
prev_sg = graph.get(sg)
|
||||
if prev_sg and prev_sg in to_delete:
|
||||
to_dedelta.add(sg)
|
||||
|
||||
return to_delete, to_dedelta
|
||||
|
||||
|
||||
class StateStore(StateGroupWorkerStore, BackgroundUpdateStore):
|
||||
|
|
Loading…
Reference in New Issue