diff --git a/changelog.d/12972.feature b/changelog.d/12972.feature new file mode 100644 index 0000000000..3c73363d28 --- /dev/null +++ b/changelog.d/12972.feature @@ -0,0 +1 @@ +Add new `media_retention` options to the homeserver config for routinely cleaning up non-recently accessed media. \ No newline at end of file diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index 1c75a23a36..392ae80a75 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1583,6 +1583,12 @@ been accessed, the media's creation time is used instead. Both thumbnails and the original media will be removed. If either of these options are unset, then media of that type will not be purged. +Local or cached remote media that has been +[quarantined](../../admin_api/media_admin_api.md#quarantining-media-in-a-room) +will not be deleted. Similarly, local media that has been marked as +[protected from quarantine](../../admin_api/media_admin_api.md#protecting-media-from-being-quarantined) +will not be deleted. 
+ Example configuration: ```yaml media_retention: diff --git a/synapse/rest/admin/media.py b/synapse/rest/admin/media.py index 8ca57bdb28..19d4a008e8 100644 --- a/synapse/rest/admin/media.py +++ b/synapse/rest/admin/media.py @@ -83,7 +83,7 @@ class QuarantineMediaByUser(RestServlet): requester = await self.auth.get_user_by_req(request) await assert_user_is_admin(self.auth, requester.user) - logging.info("Quarantining local media by user: %s", user_id) + logging.info("Quarantining media by user: %s", user_id) # Quarantine all media this user has uploaded num_quarantined = await self.store.quarantine_media_ids_by_user( @@ -112,7 +112,7 @@ class QuarantineMediaByID(RestServlet): requester = await self.auth.get_user_by_req(request) await assert_user_is_admin(self.auth, requester.user) - logging.info("Quarantining local media by ID: %s/%s", server_name, media_id) + logging.info("Quarantining media by ID: %s/%s", server_name, media_id) # Quarantine this media id await self.store.quarantine_media_by_id( @@ -140,9 +140,7 @@ class UnquarantineMediaByID(RestServlet): ) -> Tuple[int, JsonDict]: await assert_requester_is_admin(self.auth, request) - logging.info( - "Remove from quarantine local media by ID: %s/%s", server_name, media_id - ) + logging.info("Remove from quarantine media by ID: %s/%s", server_name, media_id) # Remove from quarantine this media id await self.store.quarantine_media_by_id(server_name, media_id, None) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index a551458a9f..7435fd9130 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -919,10 +919,14 @@ class MediaRepository: await self.delete_old_local_media( before_ts=local_media_threshold_timestamp_ms, keep_profiles=True, + delete_quarantined_media=False, + delete_protected_media=False, ) async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]: - old_media = await 
self.store.get_remote_media_before(before_ts) + old_media = await self.store.get_remote_media_ids( + before_ts, include_quarantined_media=False + ) deleted = 0 @@ -975,6 +979,8 @@ class MediaRepository: before_ts: int, size_gt: int = 0, keep_profiles: bool = True, + delete_quarantined_media: bool = False, + delete_protected_media: bool = False, ) -> Tuple[List[str], int]: """ Delete local or remote media from this server by size and timestamp. Removes @@ -982,18 +988,22 @@ class MediaRepository: Args: before_ts: Unix timestamp in ms. - Files that were last used before this timestamp will be deleted - size_gt: Size of the media in bytes. Files that are larger will be deleted + Files that were last used before this timestamp will be deleted. + size_gt: Size of the media in bytes. Files that are larger will be deleted. keep_profiles: Switch to delete also files that are still used in image data - (e.g user profile, room avatar) - If false these files will be deleted + (e.g. user profile, room avatar). If False these files will be deleted. + delete_quarantined_media: If True, media marked as quarantined will be deleted. + delete_protected_media: If True, media marked as protected from quarantine will be deleted. + Returns: A tuple of (list of deleted media IDs, total deleted media IDs).
""" - old_media = await self.store.get_local_media_before( + old_media = await self.store.get_local_media_ids( before_ts, size_gt, keep_profiles, + include_quarantined_media=delete_quarantined_media, + include_protected_media=delete_protected_media, ) return await self._remove_local_media_from_disk(old_media) diff --git a/synapse/storage/databases/main/media_repository.py b/synapse/storage/databases/main/media_repository.py index deffdc19ce..3c585c555a 100644 --- a/synapse/storage/databases/main/media_repository.py +++ b/synapse/storage/databases/main/media_repository.py @@ -251,12 +251,36 @@ class MediaRepositoryStore(MediaRepositoryBackgroundUpdateStore): "get_local_media_by_user_paginate_txn", get_local_media_by_user_paginate_txn ) - async def get_local_media_before( + async def get_local_media_ids( self, before_ts: int, size_gt: int, keep_profiles: bool, + include_quarantined_media: bool, + include_protected_media: bool, ) -> List[str]: + """ + Retrieve a list of media IDs from the local media store. + + Args: + before_ts: Only retrieve IDs from media that was either last accessed + (or if never accessed, created) before the given UNIX timestamp in ms. + size_gt: Only retrieve IDs from media that has a size (in bytes) greater than + the given integer. + keep_profiles: If True, exclude media IDs from the results that are used in the + following situations: + * global profile user avatar + * per-room profile user avatar + * room avatar + * a user's avatar in the user directory + include_quarantined_media: If False, exclude media IDs from the results that have + been marked as quarantined. + include_protected_media: If False, exclude media IDs from the results that have + been marked as protected from quarantine. + + Returns: + A list of local media IDs. 
+ """ # to find files that have never been accessed (last_access_ts IS NULL) # compare with `created_ts` @@ -294,12 +318,24 @@ class MediaRepositoryStore(MediaRepositoryBackgroundUpdateStore): ) sql += sql_keep - def _get_local_media_before_txn(txn: LoggingTransaction) -> List[str]: + if include_quarantined_media is False: + # Do not include media that has been quarantined + sql += """ + AND quarantined_by IS NULL + """ + + if include_protected_media is False: + # Do not include media that has been protected from quarantine + sql += """ + AND safe_from_quarantine = false + """ + + def _get_local_media_ids_txn(txn: LoggingTransaction) -> List[str]: txn.execute(sql, (before_ts, before_ts, size_gt)) return [row[0] for row in txn] return await self.db_pool.runInteraction( - "get_local_media_before", _get_local_media_before_txn + "get_local_media_ids", _get_local_media_ids_txn ) async def store_local_media( @@ -599,15 +635,37 @@ class MediaRepositoryStore(MediaRepositoryBackgroundUpdateStore): desc="store_remote_media_thumbnail", ) - async def get_remote_media_before(self, before_ts: int) -> List[Dict[str, str]]: + async def get_remote_media_ids( + self, before_ts: int, include_quarantined_media: bool + ) -> List[Dict[str, str]]: + """ + Retrieve a list of media origin, media ID and filesystem ID entries from the remote media cache. + + Args: + before_ts: Only retrieve IDs from media that was last accessed before the + given UNIX timestamp in ms. + include_quarantined_media: If False, exclude media IDs from the results that have + been marked as quarantined. + + Returns: + A list of dicts, each containing: + * `media_origin`: the server name of the homeserver the media originates from, + * `media_id` and `filesystem_id`: the ID of the media and its filesystem ID. + """ sql = ( "SELECT media_origin, media_id, filesystem_id" " FROM remote_media_cache" " WHERE last_access_ts < ?"
) + if include_quarantined_media is False: + # Only include media that has not been quarantined + sql += """ + AND quarantined_by IS NULL + """ + return await self.db_pool.execute( - "get_remote_media_before", self.db_pool.cursor_to_dict, sql, before_ts + "get_remote_media_ids", self.db_pool.cursor_to_dict, sql, before_ts ) async def delete_remote_media(self, media_origin: str, media_id: str) -> None: diff --git a/tests/rest/media/test_media_retention.py b/tests/rest/media/test_media_retention.py index b98a5cd586..14af07c5af 100644 --- a/tests/rest/media/test_media_retention.py +++ b/tests/rest/media/test_media_retention.py @@ -53,13 +53,16 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase): # Create a user to upload media with test_user_id = self.register_user("alice", "password") - # Inject media (3 images each; recently accessed, old access, never accessed) - # into both the local store and the remote cache + # Inject media (recently accessed, old access, never accessed, old access + # quarantined media) into both the local store and the remote cache, plus + # one additional local media that is marked as protected from quarantine. 
media_repository = hs.get_media_repository() test_media_content = b"example string" - def _create_media_and_set_last_accessed( + def _create_media_and_set_attributes( last_accessed_ms: Optional[int], + is_quarantined: Optional[bool] = False, + is_protected: Optional[bool] = False, ) -> str: # "Upload" some media to the local media store mxc_uri = self.get_success( @@ -84,10 +87,31 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase): ) ) + if is_quarantined: + # Mark this media as quarantined + self.get_success( + self.store.quarantine_media_by_id( + server_name=self.hs.config.server.server_name, + media_id=media_id, + quarantined_by="@theadmin:test", + ) + ) + + if is_protected: + # Mark this media as protected from quarantine + self.get_success( + self.store.mark_local_media_as_safe( + media_id=media_id, + safe=True, + ) + ) + return media_id - def _cache_remote_media_and_set_last_accessed( - media_id: str, last_accessed_ms: Optional[int] + def _cache_remote_media_and_set_attributes( + media_id: str, + last_accessed_ms: Optional[int], + is_quarantined: Optional[bool] = False, ) -> str: # Pretend to cache some remote media self.get_success( @@ -112,23 +136,58 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase): ) ) + if is_quarantined: + # Mark this media as quarantined + self.get_success( + self.store.quarantine_media_by_id( + server_name=self.remote_server_name, + media_id=media_id, + quarantined_by="@theadmin:test", + ) + ) + return media_id # Start with the local media store - self.local_recently_accessed_media = _create_media_and_set_last_accessed( - self.THIRTY_DAYS_IN_MS + self.local_recently_accessed_media = _create_media_and_set_attributes( + last_accessed_ms=self.THIRTY_DAYS_IN_MS, ) - self.local_not_recently_accessed_media = _create_media_and_set_last_accessed( - self.ONE_DAY_IN_MS + self.local_not_recently_accessed_media = _create_media_and_set_attributes( + last_accessed_ms=self.ONE_DAY_IN_MS, + ) + 
self.local_not_recently_accessed_quarantined_media = ( + _create_media_and_set_attributes( + last_accessed_ms=self.ONE_DAY_IN_MS, + is_quarantined=True, + ) + ) + self.local_not_recently_accessed_protected_media = ( + _create_media_and_set_attributes( + last_accessed_ms=self.ONE_DAY_IN_MS, + is_protected=True, + ) + ) + self.local_never_accessed_media = _create_media_and_set_attributes( + last_accessed_ms=None, ) - self.local_never_accessed_media = _create_media_and_set_last_accessed(None) # And now the remote media store - self.remote_recently_accessed_media = _cache_remote_media_and_set_last_accessed( - "a", self.THIRTY_DAYS_IN_MS + self.remote_recently_accessed_media = _cache_remote_media_and_set_attributes( + media_id="a", + last_accessed_ms=self.THIRTY_DAYS_IN_MS, ) self.remote_not_recently_accessed_media = ( - _cache_remote_media_and_set_last_accessed("b", self.ONE_DAY_IN_MS) + _cache_remote_media_and_set_attributes( + media_id="b", + last_accessed_ms=self.ONE_DAY_IN_MS, + ) + ) + self.remote_not_recently_accessed_quarantined_media = ( + _cache_remote_media_and_set_attributes( + media_id="c", + last_accessed_ms=self.ONE_DAY_IN_MS, + is_quarantined=True, + ) ) # Remote media will always have a "last accessed" attribute, as it would not # be fetched from the remote homeserver unless instigated by a user. 
@@ -163,8 +222,20 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase): ], not_purged=[ (self.hs.config.server.server_name, self.local_recently_accessed_media), + ( + self.hs.config.server.server_name, + self.local_not_recently_accessed_quarantined_media, + ), + ( + self.hs.config.server.server_name, + self.local_not_recently_accessed_protected_media, + ), (self.remote_server_name, self.remote_recently_accessed_media), (self.remote_server_name, self.remote_not_recently_accessed_media), + ( + self.remote_server_name, + self.remote_not_recently_accessed_quarantined_media, + ), ], ) @@ -199,6 +270,18 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase): self.hs.config.server.server_name, self.local_not_recently_accessed_media, ), + ( + self.hs.config.server.server_name, + self.local_not_recently_accessed_quarantined_media, + ), + ( + self.hs.config.server.server_name, + self.local_not_recently_accessed_protected_media, + ), + ( + self.remote_server_name, + self.remote_not_recently_accessed_quarantined_media, + ), (self.hs.config.server.server_name, self.local_never_accessed_media), ], )