Speed up deleting device messages (#16643)

Keeping track of a lower-bound stream ID, below which everything has already been deleted, makes the queries much faster. Otherwise, every time we scan for rows to delete we would re-scan across all the rows that have previously been deleted (until the next table VACUUM).
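In code terms, the change threads a cursor through the batch-delete loop. The sketch below is a simplified rendering of the handler change further down; `prune_device_messages` is a hypothetical wrapper name, and the batch size of 100 is illustrative rather than taken from the commit:

    async def prune_device_messages(store, user_id, device_id, up_to_stream_id):
        """Delete all device messages up to `up_to_stream_id` in bounded batches."""
        from_stream_id = None  # No lower bound yet on the first pass.
        while True:
            # Each call deletes at most `limit` rows in the range
            # (from_stream_id, up_to_stream_id] and returns the highest stream
            # ID it deleted. Feeding that back in as the next lower bound lets
            # each scan start past rows already deleted, instead of re-walking
            # dead rows that VACUUM has not yet reclaimed.
            from_stream_id, _count = await store.delete_messages_for_device_between(
                user_id=user_id,
                device_id=device_id,
                from_stream_id=from_stream_id,
                to_stream_id=up_to_stream_id,
                limit=100,
            )
            if from_stream_id is None:
                break  # Range exhausted; deletion is complete.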
Erik Johnston 2023-11-16 15:19:35 +00:00 committed by GitHub
parent 1b238e8837
commit 3e8531d3ba
4 changed files with 88 additions and 29 deletions

--- /dev/null
+++ b/changelog.d/16643.misc
@@ -0,0 +1 @@
+Speed up deleting of device messages when deleting a device.

--- a/synapse/handlers/device.py
+++ b/synapse/handlers/device.py
@@ -396,15 +396,17 @@ class DeviceWorkerHandler:
         up_to_stream_id = task.params["up_to_stream_id"]
 
         # Delete the messages in batches to avoid too much DB load.
+        from_stream_id = None
         while True:
-            res = await self.store.delete_messages_for_device(
+            from_stream_id, _ = await self.store.delete_messages_for_device_between(
                 user_id=user_id,
                 device_id=device_id,
-                up_to_stream_id=up_to_stream_id,
+                from_stream_id=from_stream_id,
+                to_stream_id=up_to_stream_id,
                 limit=DeviceHandler.DEVICE_MSGS_DELETE_BATCH_LIMIT,
             )
 
-            if res < DeviceHandler.DEVICE_MSGS_DELETE_BATCH_LIMIT:
+            if from_stream_id is None:
                 return TaskStatus.COMPLETE, None, None
 
             await self.clock.sleep(DeviceHandler.DEVICE_MSGS_DELETE_SLEEP_MS / 1000.0)

--- a/synapse/storage/databases/main/deviceinbox.py
+++ b/synapse/storage/databases/main/deviceinbox.py
@@ -450,14 +450,12 @@ class DeviceInboxWorkerStore(SQLBaseStore):
         user_id: str,
         device_id: Optional[str],
         up_to_stream_id: int,
-        limit: Optional[int] = None,
     ) -> int:
         """
         Args:
            user_id: The recipient user_id.
            device_id: The recipient device_id.
            up_to_stream_id: Where to delete messages up to.
-           limit: maximum number of messages to delete
 
        Returns:
            The number of messages deleted.
@@ -478,32 +476,22 @@ class DeviceInboxWorkerStore(SQLBaseStore):
             log_kv({"message": "No changes in cache since last check"})
             return 0
 
-        def delete_messages_for_device_txn(txn: LoggingTransaction) -> int:
-            limit_statement = "" if limit is None else f"LIMIT {limit}"
-            sql = f"""
-                DELETE FROM device_inbox WHERE user_id = ? AND device_id = ? AND stream_id <= (
-                    SELECT MAX(stream_id) FROM (
-                        SELECT stream_id FROM device_inbox
-                        WHERE user_id = ? AND device_id = ? AND stream_id <= ?
-                        ORDER BY stream_id
-                        {limit_statement}
-                    ) AS q1
-                )
-            """
-            txn.execute(sql, (user_id, device_id, user_id, device_id, up_to_stream_id))
-            return txn.rowcount
-
-        count = await self.db_pool.runInteraction(
-            "delete_messages_for_device", delete_messages_for_device_txn
-        )
+        from_stream_id = None
+        count = 0
+        while True:
+            from_stream_id, loop_count = await self.delete_messages_for_device_between(
+                user_id,
+                device_id,
+                from_stream_id=from_stream_id,
+                to_stream_id=up_to_stream_id,
+                limit=1000,
+            )
+            count += loop_count
+            if from_stream_id is None:
+                break
 
         log_kv({"message": f"deleted {count} messages for device", "count": count})
 
-        # In this case we don't know if we hit the limit or the delete is complete
-        # so let's not update the cache.
-        if count == limit:
-            return count
-
         # Update the cache, ensuring that we only ever increase the value
         updated_last_deleted_stream_id = self._last_device_delete_cache.get(
             (user_id, device_id), 0
@@ -514,6 +502,74 @@ class DeviceInboxWorkerStore(SQLBaseStore):
 
         return count
 
+    @trace
+    async def delete_messages_for_device_between(
+        self,
+        user_id: str,
+        device_id: Optional[str],
+        from_stream_id: Optional[int],
+        to_stream_id: int,
+        limit: int,
+    ) -> Tuple[Optional[int], int]:
+        """Delete N device messages between the stream IDs, returning the
+        highest stream ID deleted (or None if all messages in the range have
+        been deleted) and the number of messages deleted.
+
+        This is more efficient than `delete_messages_for_device` when calling in
+        a loop to batch delete messages.
+        """
+
+        # Keeping track of a lower bound of stream ID where we've deleted
+        # everything below makes the queries much faster. Otherwise, every time
+        # we scan for rows to delete we'd re-scan across all the rows that have
+        # previously deleted (until the next table VACUUM).
+
+        if from_stream_id is None:
+            # Minimum device stream ID is 1.
+            from_stream_id = 0
+
+        def delete_messages_for_device_between_txn(
+            txn: LoggingTransaction,
+        ) -> Tuple[Optional[int], int]:
+            txn.execute(
+                """
+                SELECT MAX(stream_id) FROM (
+                    SELECT stream_id FROM device_inbox
+                    WHERE user_id = ? AND device_id = ?
+                        AND ? < stream_id AND stream_id <= ?
+                    ORDER BY stream_id
+                    LIMIT ?
+                ) AS d
+                """,
+                (user_id, device_id, from_stream_id, to_stream_id, limit),
+            )
+            row = txn.fetchone()
+            if row is None or row[0] is None:
+                return None, 0
+
+            (max_stream_id,) = row
+
+            txn.execute(
+                """
+                DELETE FROM device_inbox
+                WHERE user_id = ? AND device_id = ?
+                    AND ? < stream_id AND stream_id <= ?
+                """,
+                (user_id, device_id, from_stream_id, max_stream_id),
+            )
+
+            num_deleted = txn.rowcount
+            if num_deleted < limit:
+                return None, num_deleted
+
+            return max_stream_id, num_deleted
+
+        return await self.db_pool.runInteraction(
+            "delete_messages_for_device_between",
+            delete_messages_for_device_between_txn,
+            db_autocommit=True,  # We don't need to run in a transaction
+        )
+
     @trace
     async def get_new_device_msgs_for_remote(
         self, destination: str, last_stream_id: int, current_stream_id: int, limit: int
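One note on the `db_autocommit=True` passed to `runInteraction` above: the SELECT and DELETE do not need to be atomic, because a failure between them (or between batches) at worst leaves rows behind for the next pass to delete, so running outside an explicit transaction gives up nothing while avoiding transaction overhead. That reading is inferred from the inline comment rather than spelled out by the commit.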

--- a/synapse/util/task_scheduler.py
+++ b/synapse/util/task_scheduler.py
@@ -193,7 +193,7 @@ class TaskScheduler:
         result: Optional[JsonMapping] = None,
         error: Optional[str] = None,
     ) -> bool:
-        """Update some task associated values. This is exposed publically so it can
+        """Update some task associated values. This is exposed publicly so it can
         be used inside task functions, mainly to update the result and be able to
         resume a task at a specific step after a restart of synapse.