Don't pull out the full state when calculating push actions (#13078)

2022-07-11 21:08:39 +01:00 · 2022-07-11 21:08:39 +01:00 · e5716b631c
parent bc8eefc1e1
commit e5716b631c
7 changed files with 164 additions and 344 deletions
--- a/changelog.d/13078.misc
+++ b/changelog.d/13078.misc
@ -0,0 +1 @@
 Reduce memory consumption when processing incoming events in large rooms.
--- a/changelog.d/13222.misc
+++ b/changelog.d/13222.misc
@ -1 +1 @@
-Improve memory usage of calculating push actions for events in large rooms.
+Reduce memory consumption when processing incoming events in large rooms.
--- a/synapse/push/bulk_push_rule_evaluator.py
+++ b/synapse/push/bulk_push_rule_evaluator.py
@ -17,7 +17,6 @@ import itertools
 import logging
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 import attr
 from prometheus_client import Counter
 from synapse.api.constants import EventTypes, Membership, RelationTypes
@ -26,13 +25,11 @@ from synapse.events import EventBase, relation_from_event
 from synapse.events.snapshot import EventContext
 from synapse.state import POWER_KEY
 from synapse.storage.databases.main.roommember import EventIdMembership
-from synapse.util.async_helpers import Linearizer
+from synapse.storage.state import StateFilter
-from synapse.util.caches import CacheMetric, register_cache
+from synapse.util.caches import register_cache
 from synapse.util.caches.descriptors import lru_cache
 from synapse.util.caches.lrucache import LruCache
 from synapse.util.metrics import measure_func
 from synapse.visibility import filter_event_for_clients_with_state
 from ..storage.state import StateFilter
 from .push_rule_evaluator import PushRuleEvaluatorForEvent
 if TYPE_CHECKING:
@ -48,15 +45,6 @@ push_rules_state_size_counter = Counter(
    "synapse_push_bulk_push_rule_evaluator_push_rules_state_size_counter", ""
 )
 # Measures whether we use the fast path of using state deltas, or if we have to
 # recalculate from scratch
 push_rules_delta_state_cache_metric = register_cache(
    "cache",
    "push_rules_delta_state_cache_metric",
    cache=[],  # Meaningless size, as this isn't a cache that stores values
    resizable=False,
 )
 STATE_EVENT_TYPES_TO_MARK_UNREAD = {
    EventTypes.Topic,
@ -111,10 +99,6 @@ class BulkPushRuleEvaluator:
        self.clock = hs.get_clock()
        self._event_auth_handler = hs.get_event_auth_handler()
        # Used by `RulesForRoom` to ensure only one thing mutates the cache at a
        # time. Keyed off room_id.
        self._rules_linearizer = Linearizer(name="rules_for_room")
        self.room_push_rule_cache_metrics = register_cache(
            "cache",
            "room_push_rule_cache",
@ -126,48 +110,48 @@ class BulkPushRuleEvaluator:
        self._relations_match_enabled = self.hs.config.experimental.msc3772_enabled
    async def _get_rules_for_event(
-        self, event: EventBase, context: EventContext
+        self,
        event: EventBase,
    ) -> Dict[str, List[Dict[str, Any]]]:
-        """This gets the rules for all users in the room at the time of the event,
+        """Get the push rules for all users who may need to be notified about
-        as well as the push rules for the invitee if the event is an invite.
+        the event.
        Note: this does not check if the user is allowed to see the event.
        Returns:
-            dict of user_id -> push_rules
+            Mapping of user ID to their push rules.
        """
-        room_id = event.room_id
+        # We get the users who may need to be notified by first fetching the
        # local users currently in the room, finding those that have push rules,
        # and *then* checking which users are actually allowed to see the event.
        #
        # The alternative is to first fetch all users that were joined at the
        # event, but that requires fetching the full state at the event, which
        # may be expensive for large rooms with few local users.
-        rules_for_room_data = self._get_rules_for_room(room_id)
+        local_users = await self.store.get_local_users_in_room(event.room_id)
        rules_for_room = RulesForRoom(
            hs=self.hs,
            room_id=room_id,
            rules_for_room_cache=self._get_rules_for_room.cache,
            room_push_rule_cache_metrics=self.room_push_rule_cache_metrics,
            linearizer=self._rules_linearizer,
            cached_data=rules_for_room_data,
        )
        rules_by_user = await rules_for_room.get_rules(event, context)
        # if this event is an invite event, we may need to run rules for the user
        # who's been invited, otherwise they won't get told they've been invited
-        if event.type == "m.room.member" and event.content["membership"] == "invite":
+        if event.type == EventTypes.Member and event.membership == Membership.INVITE:
            invited = event.state_key
-            if invited and self.hs.is_mine_id(invited):
+            if invited and self.hs.is_mine_id(invited) and invited not in local_users:
-                rules_by_user = dict(rules_by_user)
+                local_users = list(local_users)
-                rules_by_user[invited] = await self.store.get_push_rules_for_user(
+                local_users.append(invited)
-                    invited
+
-                )
+        rules_by_user = await self.store.bulk_get_push_rules(local_users)
        logger.debug("Users in room: %s", local_users)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "Returning push rules for %r %r",
                event.room_id,
                list(rules_by_user.keys()),
            )
        return rules_by_user
    @lru_cache()
    def _get_rules_for_room(self, room_id: str) -> "RulesForRoomData":
        """Get the current RulesForRoomData object for the given room id"""
        # It's important that the RulesForRoomData object gets added to self._get_rules_for_room.cache
        # before any lookup methods get called on it as otherwise there may be
        # a race if invalidate_all gets called (which assumes its in the cache)
        return RulesForRoomData()
    async def _get_power_levels_and_sender_level(
        self, event: EventBase, context: EventContext
    ) -> Tuple[dict, int]:
@ -262,10 +246,12 @@ class BulkPushRuleEvaluator:
        count_as_unread = _should_count_as_unread(event, context)
-        rules_by_user = await self._get_rules_for_event(event, context)
+        rules_by_user = await self._get_rules_for_event(event)
        actions_by_user: Dict[str, List[Union[dict, str]]] = {}
-        room_members = await self.store.get_joined_users_from_context(event, context)
+        room_member_count = await self.store.get_number_joined_users_in_room(
            event.room_id
        )
        (
            power_levels,
@ -278,30 +264,36 @@ class BulkPushRuleEvaluator:
        evaluator = PushRuleEvaluatorForEvent(
            event,
-            len(room_members),
+            room_member_count,
            sender_power_level,
            power_levels,
            relations,
            self._relations_match_enabled,
        )
-        # If the event is not a state event check if any users ignore the sender.
+        users = rules_by_user.keys()
-        if not event.is_state():
+        profiles = await self.store.get_subset_users_in_room_with_profiles(
-            ignorers = await self.store.ignored_by(event.sender)
+            event.room_id, users
-        else:
+        )
-            ignorers = frozenset()
+
        # This is a check for the case where user joins a room without being
        # allowed to see history, and then the server receives a delayed event
        # from before the user joined, which they should not be pushed for
        uids_with_visibility = await filter_event_for_clients_with_state(
            self.store, users, event, context
        )
        for uid, rules in rules_by_user.items():
            if event.sender == uid:
                continue
-            if uid in ignorers:
+            if uid not in uids_with_visibility:
                continue
            display_name = None
-            profile_info = room_members.get(uid)
+            profile = profiles.get(uid)
-            if profile_info:
+            if profile:
-                display_name = profile_info.display_name
+                display_name = profile.display_name
            if not display_name:
                # Handle the case where we are pushing a membership event to
@ -346,283 +338,3 @@ MemberMap = Dict[str, Optional[EventIdMembership]]
 Rule = Dict[str, dict]
 RulesByUser = Dict[str, List[Rule]]
 StateGroup = Union[object, int]
@attr.s(slots=True, auto_attribs=True)
 class RulesForRoomData:
    """The data stored in the cache by `RulesForRoom`.
    We don't store `RulesForRoom` directly in the cache as we want our caches to
    *only* include data, and not references to e.g. the data stores.
    """
    # event_id -> EventIdMembership
    member_map: MemberMap = attr.Factory(dict)
    # user_id -> rules
    rules_by_user: RulesByUser = attr.Factory(dict)
    # The last state group we updated the caches for. If the state_group of
    # a new event comes along, we know that we can just return the cached
    # result.
    # On invalidation of the rules themselves (if the user changes them),
    # we invalidate everything and set state_group to `object()`
    state_group: StateGroup = attr.Factory(object)
    # A sequence number to keep track of when we're allowed to update the
    # cache. We bump the sequence number when we invalidate the cache. If
    # the sequence number changes while we're calculating stuff we should
    # not update the cache with it.
    sequence: int = 0
    # A cache of user_ids that we *know* aren't interesting, e.g. user_ids
    # owned by AS's, or remote users, etc. (I.e. users we will never need to
    # calculate push for)
    # These never need to be invalidated as we will never set up push for
    # them.
    uninteresting_user_set: Set[str] = attr.Factory(set)
 class RulesForRoom:
    """Caches push rules for users in a room.
    This efficiently handles users joining/leaving the room by not invalidating
    the entire cache for the room.
    A new instance is constructed for each call to
    `BulkPushRuleEvaluator._get_rules_for_event`, with the cached data from
    previous calls passed in.
    """
    def __init__(
        self,
        hs: "HomeServer",
        room_id: str,
        rules_for_room_cache: LruCache,
        room_push_rule_cache_metrics: CacheMetric,
        linearizer: Linearizer,
        cached_data: RulesForRoomData,
    ):
        """
        Args:
            hs: The HomeServer object.
            room_id: The room ID.
            rules_for_room_cache: The cache object that caches these
                RoomsForUser objects.
            room_push_rule_cache_metrics: The metrics object
            linearizer: The linearizer used to ensure only one thing mutates
                the cache at a time. Keyed off room_id
            cached_data: Cached data from previous calls to `self.get_rules`,
                can be mutated.
        """
        self.room_id = room_id
        self.is_mine_id = hs.is_mine_id
        self.store = hs.get_datastores().main
        self.room_push_rule_cache_metrics = room_push_rule_cache_metrics
        # Used to ensure only one thing mutates the cache at a time. Keyed off
        # room_id.
        self.linearizer = linearizer
        self.data = cached_data
        # We need to be clever on the invalidating caches callbacks, as
        # otherwise the invalidation callback holds a reference to the object,
        # potentially causing it to leak.
        # To get around this we pass a function that on invalidations looks ups
        # the RoomsForUser entry in the cache, rather than keeping a reference
        # to self around in the callback.
        self.invalidate_all_cb = _Invalidation(rules_for_room_cache, room_id)
    async def get_rules(
        self, event: EventBase, context: EventContext
    ) -> Dict[str, List[Dict[str, dict]]]:
        """Given an event context return the rules for all users who are
        currently in the room.
        """
        state_group = context.state_group
        if state_group and self.data.state_group == state_group:
            logger.debug("Using cached rules for %r", self.room_id)
            self.room_push_rule_cache_metrics.inc_hits()
            return self.data.rules_by_user
        async with self.linearizer.queue(self.room_id):
            if state_group and self.data.state_group == state_group:
                logger.debug("Using cached rules for %r", self.room_id)
                self.room_push_rule_cache_metrics.inc_hits()
                return self.data.rules_by_user
            self.room_push_rule_cache_metrics.inc_misses()
            ret_rules_by_user = {}
            missing_member_event_ids = {}
            if state_group and self.data.state_group == context.prev_group:
                # If we have a simple delta then we can reuse most of the previous
                # results.
                ret_rules_by_user = self.data.rules_by_user
                current_state_ids = context.delta_ids
                push_rules_delta_state_cache_metric.inc_hits()
            else:
                current_state_ids = await context.get_current_state_ids()
                push_rules_delta_state_cache_metric.inc_misses()
            # Ensure the state IDs exist.
            assert current_state_ids is not None
            push_rules_state_size_counter.inc(len(current_state_ids))
            logger.debug(
                "Looking for member changes in %r %r", state_group, current_state_ids
            )
            # Loop through to see which member events we've seen and have rules
            # for and which we need to fetch
            for key in current_state_ids:
                typ, user_id = key
                if typ != EventTypes.Member:
                    continue
                if user_id in self.data.uninteresting_user_set:
                    continue
                if not self.is_mine_id(user_id):
                    self.data.uninteresting_user_set.add(user_id)
                    continue
                if self.store.get_if_app_services_interested_in_user(user_id):
                    self.data.uninteresting_user_set.add(user_id)
                    continue
                event_id = current_state_ids[key]
                res = self.data.member_map.get(event_id, None)
                if res:
                    if res.membership == Membership.JOIN:
                        rules = self.data.rules_by_user.get(res.user_id, None)
                        if rules:
                            ret_rules_by_user[res.user_id] = rules
                    continue
                # If a user has left a room we remove their push rule. If they
                # joined then we re-add it later in _update_rules_with_member_event_ids
                ret_rules_by_user.pop(user_id, None)
                missing_member_event_ids[user_id] = event_id
            if missing_member_event_ids:
                # If we have some member events we haven't seen, look them up
                # and fetch push rules for them if appropriate.
                logger.debug("Found new member events %r", missing_member_event_ids)
                await self._update_rules_with_member_event_ids(
                    ret_rules_by_user, missing_member_event_ids, state_group, event
                )
            else:
                # The push rules didn't change but lets update the cache anyway
                self.update_cache(
                    self.data.sequence,
                    members={},  # There were no membership changes
                    rules_by_user=ret_rules_by_user,
                    state_group=state_group,
                )
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(
                "Returning push rules for %r %r", self.room_id, ret_rules_by_user.keys()
            )
        return ret_rules_by_user
    async def _update_rules_with_member_event_ids(
        self,
        ret_rules_by_user: Dict[str, list],
        member_event_ids: Dict[str, str],
        state_group: Optional[int],
        event: EventBase,
    ) -> None:
        """Update the partially filled rules_by_user dict by fetching rules for
        any newly joined users in the `member_event_ids` list.
        Args:
            ret_rules_by_user: Partially filled dict of push rules. Gets
                updated with any new rules.
            member_event_ids: Dict of user id to event id for membership events
                that have happened since the last time we filled rules_by_user
            state_group: The state group we are currently computing push rules
                for. Used when updating the cache.
            event: The event we are currently computing push rules for.
        """
        sequence = self.data.sequence
        members = await self.store.get_membership_from_event_ids(
            member_event_ids.values()
        )
        # If the event is a join event then it will be in current state events
        # map but not in the DB, so we have to explicitly insert it.
        if event.type == EventTypes.Member:
            for event_id in member_event_ids.values():
                if event_id == event.event_id:
                    members[event_id] = EventIdMembership(
                        user_id=event.state_key, membership=event.membership
                    )
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Found members %r: %r", self.room_id, members.values())
        joined_user_ids = {
            entry.user_id
            for entry in members.values()
            if entry and entry.membership == Membership.JOIN
        }
        logger.debug("Joined: %r", joined_user_ids)
        # Previously we only considered users with pushers or read receipts in that
        # room. We can't do this anymore because we use push actions to calculate unread
        # counts, which don't rely on the user having pushers or sent a read receipt into
        # the room. Therefore we just need to filter for local users here.
        user_ids = list(filter(self.is_mine_id, joined_user_ids))
        rules_by_user = await self.store.bulk_get_push_rules(
            user_ids, on_invalidate=self.invalidate_all_cb
        )
        ret_rules_by_user.update(
            item for item in rules_by_user.items() if item[0] is not None
        )
        self.update_cache(sequence, members, ret_rules_by_user, state_group)
    def update_cache(
        self,
        sequence: int,
        members: MemberMap,
        rules_by_user: RulesByUser,
        state_group: StateGroup,
    ) -> None:
        if sequence == self.data.sequence:
            self.data.member_map.update(members)
            self.data.rules_by_user = rules_by_user
            self.data.state_group = state_group
@attr.attrs(slots=True, frozen=True, auto_attribs=True)
 class _Invalidation:
    # _Invalidation is passed as an `on_invalidate` callback to bulk_get_push_rules,
    # which means that it it is stored on the bulk_get_push_rules cache entry. In order
    # to ensure that we don't accumulate lots of redundant callbacks on the cache entry,
    # we need to ensure that two _Invalidation objects are "equal" if they refer to the
    # same `cache` and `room_id`.
    #
    # attrs provides suitable __hash__ and __eq__ methods, provided we remember to
    # set `frozen=True`.
    cache: LruCache
    room_id: str
    def __call__(self) -> None:
        rules_data = self.cache.get(self.room_id, None, update_metrics=False)
        if rules_data:
            rules_data.sequence += 1
            rules_data.state_group = object()
            rules_data.member_map = {}
            rules_data.rules_by_user = {}
            push_rules_invalidation_counter.inc()
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@ -75,6 +75,15 @@ class SQLBaseStore(metaclass=ABCMeta):
            self._attempt_to_invalidate_cache(
                "get_users_in_room_with_profiles", (room_id,)
            )
            self._attempt_to_invalidate_cache(
                "get_number_joined_users_in_room", (room_id,)
            )
            self._attempt_to_invalidate_cache("get_local_users_in_room", (room_id,))
        for user_id in members_changed:
            self._attempt_to_invalidate_cache(
                "get_user_in_room_with_profile", (room_id, user_id)
            )
        # Purge other caches based on room state.
        self._attempt_to_invalidate_cache("get_room_summary", (room_id,))
--- a/synapse/storage/databases/main/events.py
+++ b/synapse/storage/databases/main/events.py
@ -1797,6 +1797,18 @@ class PersistEventsStore:
                self.store.get_invited_rooms_for_local_user.invalidate,
                (event.state_key,),
            )
            txn.call_after(
                self.store.get_local_users_in_room.invalidate,
                (event.room_id,),
            )
            txn.call_after(
                self.store.get_number_joined_users_in_room.invalidate,
                (event.room_id,),
            )
            txn.call_after(
                self.store.get_user_in_room_with_profile.invalidate,
                (event.room_id, event.state_key),
            )
            # The `_get_membership_from_event_id` is immutable, except for the
            # case where we look up an event *before* persisting it.
--- a/synapse/storage/databases/main/roommember.py
+++ b/synapse/storage/databases/main/roommember.py
@ -212,6 +212,60 @@ class RoomMemberWorkerStore(EventsWorkerStore):
        txn.execute(sql, (room_id, Membership.JOIN))
        return [r[0] for r in txn]
    @cached()
    def get_user_in_room_with_profile(
        self, room_id: str, user_id: str
    ) -> Dict[str, ProfileInfo]:
        raise NotImplementedError()
    @cachedList(
        cached_method_name="get_user_in_room_with_profile", list_name="user_ids"
    )
    async def get_subset_users_in_room_with_profiles(
        self, room_id: str, user_ids: Collection[str]
    ) -> Dict[str, ProfileInfo]:
        """Get a mapping from user ID to profile information for a list of users
        in a given room.
        The profile information comes directly from this room's `m.room.member`
        events, and so may be specific to this room rather than part of a user's
        global profile. To avoid privacy leaks, the profile data should only be
        revealed to users who are already in this room.
        Args:
            room_id: The ID of the room to retrieve the users of.
            user_ids: a list of users in the room to run the query for
        Returns:
                A mapping from user ID to ProfileInfo.
        """
        def _get_subset_users_in_room_with_profiles(
            txn: LoggingTransaction,
        ) -> Dict[str, ProfileInfo]:
            clause, ids = make_in_list_sql_clause(
                self.database_engine, "m.user_id", user_ids
            )
            sql = """
                SELECT state_key, display_name, avatar_url FROM room_memberships as m
                INNER JOIN current_state_events as c
                ON m.event_id = c.event_id
                AND m.room_id = c.room_id
                AND m.user_id = c.state_key
                WHERE c.type = 'm.room.member' AND c.room_id = ? AND m.membership = ? AND %s
            """ % (
                clause,
            )
            txn.execute(sql, (room_id, Membership.JOIN, *ids))
            return {r[0]: ProfileInfo(display_name=r[1], avatar_url=r[2]) for r in txn}
        return await self.db_pool.runInteraction(
            "get_subset_users_in_room_with_profiles",
            _get_subset_users_in_room_with_profiles,
        )
    @cached(max_entries=100000, iterable=True)
    async def get_users_in_room_with_profiles(
        self, room_id: str
@ -337,6 +391,15 @@ class RoomMemberWorkerStore(EventsWorkerStore):
            "get_room_summary", _get_room_summary_txn
        )
    @cached()
    async def get_number_joined_users_in_room(self, room_id: str) -> int:
        return await self.db_pool.simple_select_one_onecol(
            table="current_state_events",
            keyvalues={"room_id": room_id, "membership": Membership.JOIN},
            retcol="COUNT(*)",
            desc="get_number_joined_users_in_room",
        )
    @cached()
    async def get_invited_rooms_for_local_user(
        self, user_id: str
@ -416,6 +479,17 @@ class RoomMemberWorkerStore(EventsWorkerStore):
        user_id: str,
        membership_list: List[str],
    ) -> List[RoomsForUser]:
        """Get all the rooms for this *local* user where the membership for this user
        matches one in the membership list.
        Args:
            user_id: The user ID.
            membership_list: A list of synapse.api.constants.Membership
                    values which the user must be in.
        Returns:
            The RoomsForUser that the user matches the membership types.
        """
        # Paranoia check.
        if not self.hs.is_mine_id(user_id):
            raise Exception(
@ -444,6 +518,18 @@ class RoomMemberWorkerStore(EventsWorkerStore):
        return results
    @cached(iterable=True)
    async def get_local_users_in_room(self, room_id: str) -> List[str]:
        """
        Retrieves a list of the current roommembers who are local to the server.
        """
        return await self.db_pool.simple_select_onecol(
            table="local_current_membership",
            keyvalues={"room_id": room_id, "membership": Membership.JOIN},
            retcol="user_id",
            desc="get_local_users_in_room",
        )
    async def get_local_current_membership_for_user_in_room(
        self, user_id: str, room_id: str
    ) -> Tuple[Optional[str], Optional[str]]:
--- a/tests/rest/client/test_rooms.py
+++ b/tests/rest/client/test_rooms.py
@ -709,7 +709,7 @@ class RoomsCreateTestCase(RoomBase):
        self.assertEqual(200, channel.code, channel.result)
        self.assertTrue("room_id" in channel.json_body)
        assert channel.resource_usage is not None
-        self.assertEqual(33, channel.resource_usage.db_txn_count)
+        self.assertEqual(37, channel.resource_usage.db_txn_count)
    def test_post_room_initial_state(self) -> None:
        # POST with initial_state config key, expect new room id
@ -722,7 +722,7 @@ class RoomsCreateTestCase(RoomBase):
        self.assertEqual(200, channel.code, channel.result)
        self.assertTrue("room_id" in channel.json_body)
        assert channel.resource_usage is not None
-        self.assertEqual(37, channel.resource_usage.db_txn_count)
+        self.assertEqual(41, channel.resource_usage.db_txn_count)
    def test_post_room_visibility_key(self) -> None:
        # POST with visibility config key, expect new room id
		`@ -0,0 +1 @@`
							`Reduce memory consumption when processing incoming events in large rooms.`
`@ -1 +1 @@`
	`Improve memory usage of calculating push actions for events in large rooms.`	`Reduce memory consumption when processing incoming events in large rooms.`