Use dictionary cache to do group -> state fetching

Erik Johnston 2015-08-05 15:06:51 +01:00
parent c67ba143fa
commit 07507643cb
7 changed files with 195 additions and 110 deletions

synapse/handlers/federation.py

@@ -507,7 +507,7 @@ class FederationHandler(BaseHandler):
         event_ids = list(extremities.keys())
 
         states = yield defer.gatherResults([
-            self.state_handler.resolve_state_groups([e])
+            self.state_handler.resolve_state_groups(room_id, [e])
            for e in event_ids
         ])
         states = dict(zip(event_ids, [s[1] for s in states]))

synapse/state.py

@@ -96,7 +96,7 @@ class StateHandler(object):
             cache.ts = self.clock.time_msec()
             state = cache.state
         else:
-            res = yield self.resolve_state_groups(event_ids)
+            res = yield self.resolve_state_groups(room_id, event_ids)
             state = res[1]
 
         if event_type:
@@ -155,13 +155,13 @@ class StateHandler(object):
         if event.is_state():
             ret = yield self.resolve_state_groups(
-                [e for e, _ in event.prev_events],
+                event.room_id, [e for e, _ in event.prev_events],
                 event_type=event.type,
                 state_key=event.state_key,
             )
         else:
             ret = yield self.resolve_state_groups(
-                [e for e, _ in event.prev_events],
+                event.room_id, [e for e, _ in event.prev_events],
             )
 
         group, curr_state, prev_state = ret
@@ -180,7 +180,7 @@ class StateHandler(object):
 
     @defer.inlineCallbacks
     @log_function
-    def resolve_state_groups(self, event_ids, event_type=None, state_key=""):
+    def resolve_state_groups(self, room_id, event_ids, event_type=None, state_key=""):
         """ Given a list of event_ids this method fetches the state at each
         event, resolves conflicts between them and returns them.
@@ -205,7 +205,7 @@ class StateHandler(object):
         )
 
         state_groups = yield self.store.get_state_groups(
-            event_ids
+            room_id, event_ids
         )
 
         logger.debug(
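Taken together, the hunks above and the federation change are one mechanical refactor: resolve_state_groups (and StateStore.get_state_groups below) now take the room ID as their first argument, so the storage layer can map events to state groups without loading the events themselves. A sketch of the new call shape, with invented identifiers:

    # Hypothetical call site; only the argument order is the point here.
    room_id = "!abc:example.com"
    prev_event_ids = ["$event1:example.com"]

    # Old:  resolve_state_groups(prev_event_ids)
    # New:  the room ID is threaded through from every caller.
    ret = yield self.state_handler.resolve_state_groups(
        room_id, prev_event_ids,
    )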

synapse/storage/_base.py

@@ -18,6 +18,7 @@ from synapse.api.errors import StoreError
 from synapse.util.logutils import log_function
 from synapse.util.logcontext import preserve_context_over_fn, LoggingContext
 from synapse.util.lrucache import LruCache
+from synapse.util.dictionary_cache import DictionaryCache
 import synapse.metrics
 
 from util.id_generators import IdGenerator, StreamIdGenerator
@@ -87,23 +88,33 @@ class Cache(object):
         )
 
     def get(self, *keyargs):
-        if len(keyargs) != self.keylen:
-            raise ValueError("Expected a key to have %d items", self.keylen)
+        try:
+            if len(keyargs) != self.keylen:
+                raise ValueError("Expected a key to have %d items", self.keylen)
 
-        val = self.cache.get(keyargs, self.sentinel)
-        if val is not self.sentinel:
-            cache_counter.inc_hits(self.name)
-            return val
+            val = self.cache.get(keyargs, self.sentinel)
+            if val is not self.sentinel:
+                cache_counter.inc_hits(self.name)
+                return val
 
-        cache_counter.inc_misses(self.name)
-        raise KeyError()
+            cache_counter.inc_misses(self.name)
+            raise KeyError()
+        except KeyError:
+            raise
+        except:
+            logger.exception("Cache.get failed for %s" % (self.name,))
+            raise
 
     def update(self, sequence, *args):
-        self.check_thread()
-        if self.sequence == sequence:
-            # Only update the cache if the caches sequence number matches the
-            # number that the cache had before the SELECT was started (SYN-369)
-            self.prefill(*args)
+        try:
+            self.check_thread()
+            if self.sequence == sequence:
+                # Only update the cache if the caches sequence number matches the
+                # number that the cache had before the SELECT was started (SYN-369)
+                self.prefill(*args)
+        except:
+            logger.exception("Cache.update failed for %s" % (self.name,))
+            raise
 
     def prefill(self, *args):  # because I can't *keyargs, value
         keyargs = args[:-1]
@@ -327,6 +338,8 @@ class SQLBaseStore(object):
         self._get_event_cache = Cache("*getEvent*", keylen=3, lru=True,
                                       max_entries=hs.config.event_cache_size)
 
+        self._state_group_cache = DictionaryCache("*stateGroupCache*", 100000)
+
         self._event_fetch_lock = threading.Condition()
         self._event_fetch_list = []
         self._event_fetch_ongoing = 0
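The new _state_group_cache is the centrepiece of this commit: a DictionaryCache of up to 100,000 entries, keyed by state group ID, each value being a dict from (type, state_key) to the corresponding state event. Unlike the plain Cache above, it supports partial hits and reports, via a full flag, whether it holds the group's complete state. A sketch of the read protocol (this is how _get_state_for_group consumes it later in this diff; the dict key is invented):

    # DictionaryCache.get returns a (full, value) namedtuple, so callers can
    # tell a complete entry from a partial one.
    is_all, state_dict = self._state_group_cache.get(group)

    # A partial read asks only for specific (type, state_key) pairs:
    entry = self._state_group_cache.get(
        group, dict_keys=[("m.room.name", "")],
    )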

synapse/storage/state.py

@@ -45,52 +45,38 @@ class StateStore(SQLBaseStore):
         """
 
     @defer.inlineCallbacks
-    def get_state_groups(self, event_ids):
+    def get_state_groups(self, room_id, event_ids):
         """ Get the state groups for the given list of event_ids
 
         The return value is a dict mapping group names to lists of events.
         """
-        def f(txn):
-            groups = set()
-            for event_id in event_ids:
-                group = self._simple_select_one_onecol_txn(
-                    txn,
-                    table="event_to_state_groups",
-                    keyvalues={"event_id": event_id},
-                    retcol="state_group",
-                    allow_none=True,
-                )
-                if group:
-                    groups.add(group)
-
-            res = {}
-            for group in groups:
-                state_ids = self._simple_select_onecol_txn(
-                    txn,
-                    table="state_groups_state",
-                    keyvalues={"state_group": group},
-                    retcol="event_id",
-                )
-                res[group] = state_ids
-
-            return res
-
-        states = yield self.runInteraction(
-            "get_state_groups",
-            f,
-        )
-
-        state_list = yield defer.gatherResults(
+        event_and_groups = yield defer.gatherResults(
             [
-                self._fetch_events_for_group(group, vals)
-                for group, vals in states.items()
+                self._get_state_group_for_event(
+                    room_id, event_id,
+                ).addCallback(lambda group, event_id: (event_id, group), event_id)
+                for event_id in event_ids
             ],
             consumeErrors=True,
-        )
+        ).addErrback(unwrapFirstError)
+
+        groups = set(group for _, group in event_and_groups if group)
 
-        defer.returnValue(dict(state_list))
+        group_to_state = yield defer.gatherResults(
+            [
+                self._get_state_for_group(
+                    group,
+                ).addCallback(lambda state_dict, group: (group, state_dict), group)
+                for group in groups
+            ],
+            consumeErrors=True,
+        ).addErrback(unwrapFirstError)
+
+        defer.returnValue({
+            group: state_map.values()
+            for group, state_map in group_to_state
+        })
 
     @cached(num_args=1)
     def _fetch_events_for_group(self, key, events):
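The rewrite replaces one big runInteraction transaction with two concurrent, individually cacheable phases: each event is first mapped to its state group (_get_state_group_for_event, cached per event), then each distinct group is expanded to its state once (_get_state_for_group, backed by the dictionary cache), so events sharing a group no longer trigger duplicate work. The same pattern reappears in get_state_for_events further down. A standalone model of the flow, with hypothetical fetcher callables standing in for the cached storage methods:

    def get_state_groups_sketch(event_ids, group_for_event, state_for_group):
        # Phase 1: event_id -> state_group (cheap, cached per event).
        event_and_groups = [(e, group_for_event(e)) for e in event_ids]

        # Phase 2: fetch each distinct group's state exactly once.
        groups = {g for _, g in event_and_groups if g}
        group_to_state = {g: state_for_group(g) for g in groups}

        # Same return shape as before: group -> list of state events.
        return {g: list(s.values()) for g, s in group_to_state.items()}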
@@ -207,16 +193,25 @@ class StateStore(SQLBaseStore):
         events = yield self._get_events(event_ids, get_prev_content=False)
         defer.returnValue(events)
 
-    @cached(num_args=3, lru=True)
-    def _get_state_groups_from_group(self, room_id, group, types):
+    @cached(num_args=2, lru=True, max_entries=10000)
+    def _get_state_groups_from_group(self, group, types):
         def f(txn):
+            if types is not None:
+                where_clause = "AND (%s)" % (
+                    " OR ".join(["(type = ? AND state_key = ?)"] * len(types)),
+                )
+            else:
+                where_clause = ""
+
             sql = (
                 "SELECT event_id FROM state_groups_state WHERE"
-                " room_id = ? AND state_group = ? AND (%s)"
-            ) % (" OR ".join(["(type = ? AND state_key = ?)"] * len(types)),)
+                " state_group = ? %s"
+            ) % (where_clause,)
 
-            args = [room_id, group]
-            args.extend([i for typ in types for i in typ])
+            args = [group]
+            if types is not None:
+                args.extend([i for typ in types for i in typ])
 
             txn.execute(sql, args)
 
             return group, [
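Two behavioural changes are folded into this hunk: types=None now means "fetch the group's entire state" (the old query always required a type filter), and the room_id predicate is gone, since a state group ID alone identifies its rows; that is also why room_id drops out of the cache key (num_args goes from 3 to 2). Roughly the SQL produced, with an invented filter for illustration:

    # types=None (whole group):
    #     SELECT event_id FROM state_groups_state WHERE state_group = ?
    # types=[("m.room.member", "@alice:example.com")]:
    #     SELECT event_id FROM state_groups_state WHERE state_group = ?
    #         AND ((type = ? AND state_key = ?))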
@@ -229,7 +224,7 @@ class StateStore(SQLBaseStore):
             f,
         )
 
-    @cached(num_args=3, lru=True, max_entries=100000)
+    @cached(num_args=3, lru=True, max_entries=20000)
     def _get_state_for_event_id(self, room_id, event_id, types):
         def f(txn):
             type_and_state_sql = " OR ".join([
@@ -280,40 +275,33 @@ class StateStore(SQLBaseStore):
             deferred: A list of dicts corresponding to the event_ids given.
             The dicts are mappings from (type, state_key) -> state_events
         """
-        set_types = frozenset(types)
-        res = yield defer.gatherResults(
+        event_and_groups = yield defer.gatherResults(
             [
-                self._get_state_for_event_id(
-                    room_id, event_id, set_types,
-                )
+                self._get_state_group_for_event(
+                    room_id, event_id,
+                ).addCallback(lambda group, event_id: (event_id, group), event_id)
                 for event_id in event_ids
             ],
             consumeErrors=True,
         ).addErrback(unwrapFirstError)
 
-        event_to_state_ids = dict(res)
+        groups = set(group for _, group in event_and_groups)
 
-        event_dict = yield self._get_events(
+        res = yield defer.gatherResults(
             [
-                item
-                for lst in event_to_state_ids.values()
-                for item in lst
+                self._get_state_for_group(
+                    group, types
+                ).addCallback(lambda state_dict, group: (group, state_dict), group)
+                for group in groups
             ],
-            get_prev_content=False
-        ).addCallback(
-            lambda evs: {ev.event_id: ev for ev in evs}
-        )
+            consumeErrors=True,
+        ).addErrback(unwrapFirstError)
+
+        group_to_state = dict(res)
 
         event_to_state = {
-            event_id: {
-                (ev.type, ev.state_key): ev
-                for ev in [
-                    event_dict[state_id]
-                    for state_id in state_ids
-                    if state_id in event_dict
-                ]
-            }
-            for event_id, state_ids in event_to_state_ids.items()
+            event_id: group_to_state[group]
+            for event_id, group in event_and_groups
         }
 
         defer.returnValue([
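State is now attached to events indirectly through their group, so the per-event cache (_get_state_for_event_id, shrunk above) does far less work: events at the same point in a room's DAG share one cached state map. A worked micro-example with invented IDs; note the sharing is literal, so the returned mappings are best treated as read-only:

    # Three events in one state group resolve to the same state dict:
    #     event_and_groups = [("$a", 7), ("$b", 7), ("$c", 7)]
    #     groups           = {7}              # one _get_state_for_group call
    #     event_to_state   = {"$a": s7, "$b": s7, "$c": s7}  # same object s7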
@@ -321,6 +309,79 @@ class StateStore(SQLBaseStore):
             for event in event_ids
         ])
 
+    @cached(num_args=2, lru=True, max_entries=100000)
+    def _get_state_group_for_event(self, room_id, event_id):
+        return self._simple_select_one_onecol(
+            table="event_to_state_groups",
+            keyvalues={
+                "event_id": event_id,
+            },
+            retcol="state_group",
+            allow_none=True,
+            desc="_get_state_group_for_event",
+        )
+
+    @defer.inlineCallbacks
+    def _get_state_for_group(self, group, types=None):
+        is_all, state_dict = self._state_group_cache.get(group)
+
+        type_to_key = {}
+        missing_types = set()
+        if types is not None:
+            for typ, state_key in types:
+                if state_key is None:
+                    type_to_key[typ] = None
+                    missing_types.add((typ, state_key))
+                else:
+                    if type_to_key.get(typ, object()) is not None:
+                        type_to_key.setdefault(typ, set()).add(state_key)
+
+                    if (typ, state_key) not in state_dict:
+                        missing_types.add((typ, state_key))
+
+        if is_all and types is None:
+            defer.returnValue(state_dict)
+
+        if is_all or (types is not None and not missing_types):
+            def include(typ, state_key):
+                sentinel = object()
+                valid_state_keys = type_to_key.get(typ, sentinel)
+                if valid_state_keys is sentinel:
+                    return False
+                if valid_state_keys is None:
+                    return True
+                if state_key in valid_state_keys:
+                    return True
+                return False
+
+            defer.returnValue({
+                k: v
+                for k, v in state_dict.items()
+                if include(k[0], k[1])
+            })
+
+        # Okay, so we have some missing_types, lets fetch them.
+        cache_seq_num = self._state_group_cache.sequence
+        _, state_ids = yield self._get_state_groups_from_group(
+            group,
+            frozenset(types) if types else None
+        )
+        state_events = yield self._get_events(state_ids, get_prev_content=False)
+        state_dict = {
+            (e.type, e.state_key): e
+            for e in state_events
+        }
+
+        # Update the cache
+        self._state_group_cache.update(
+            cache_seq_num,
+            key=group,
+            value=state_dict,
+            full=(types is None),
+        )
+
+        defer.returnValue(state_dict)
+
 
 def _make_group_id(clock):
     return str(int(clock.time_msec())) + random_string(5)
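_get_state_for_group is where the cache-or-fetch decision lives: serve from cache when the entry is full, or when every requested (type, state_key) pair is already present; otherwise snapshot the cache's sequence number, read from the database, and write back, so the sequence check silently drops updates that raced with an invalidation (the same SYN-369 guard as the plain Cache). A simplified standalone model; it omits the state_key=None wildcard handling that type_to_key implements, and a hypothetical fetch_from_db stands in for the query plus _get_events:

    def get_state_for_group_sketch(cache, group, types, fetch_from_db):
        is_all, state_dict = cache.get(group)

        if types is None and is_all:
            return state_dict              # full hit, nothing to filter

        if types is not None:
            missing = [t for t in types if t not in state_dict]
            if is_all or not missing:
                wanted = set(types)
                return {k: v for k, v in state_dict.items() if k in wanted}

        # Miss (or partial miss): snapshot the sequence before reading so a
        # concurrent invalidation makes the update below a no-op.
        seq = cache.sequence
        state_dict = fetch_from_db(group, types)
        cache.update(seq, key=group, value=state_dict, full=(types is None))
        return state_dict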

synapse/storage/stream.py

@@ -300,8 +300,7 @@ class StreamStore(SQLBaseStore):
         defer.returnValue((events, token))
 
     @defer.inlineCallbacks
-    def get_recent_events_for_room(self, room_id, limit, end_token,
-                                   with_feedback=False, from_token=None):
+    def get_recent_events_for_room(self, room_id, limit, end_token, from_token=None):
         # TODO (erikj): Handle compressed feedback
 
         end_token = RoomStreamToken.parse_stream_token(end_token)

synapse/util/dictionary_cache.py

@@ -16,6 +16,10 @@
 from synapse.util.lrucache import LruCache
 from collections import namedtuple
 import threading
+import logging
+
+
+logger = logging.getLogger(__name__)
 
 
 DictionaryEntry = namedtuple("DictionaryEntry", ("full", "value"))
@@ -47,21 +51,25 @@ class DictionaryCache(object):
         )
 
     def get(self, key, dict_keys=None):
-        entry = self.cache.get(key, self.sentinel)
-        if entry is not self.sentinel:
-            # cache_counter.inc_hits(self.name)
+        try:
+            entry = self.cache.get(key, self.sentinel)
+            if entry is not self.sentinel:
+                # cache_counter.inc_hits(self.name)
 
-            if dict_keys is None:
-                return DictionaryEntry(entry.full, dict(entry.value))
-            else:
-                return DictionaryEntry(entry.full, {
-                    k: entry.value[k]
-                    for k in dict_keys
-                    if k in entry.value
-                })
+                if dict_keys is None:
+                    return DictionaryEntry(entry.full, dict(entry.value))
+                else:
+                    return DictionaryEntry(entry.full, {
+                        k: entry.value[k]
+                        for k in dict_keys
+                        if k in entry.value
+                    })
 
-        # cache_counter.inc_misses(self.name)
-        return DictionaryEntry(False, {})
+            # cache_counter.inc_misses(self.name)
+            return DictionaryEntry(False, {})
+        except:
+            logger.exception("get failed")
+            raise
 
     def invalidate(self, key):
         self.check_thread()
@@ -77,14 +85,18 @@ class DictionaryCache(object):
         self.cache.clear()
 
     def update(self, sequence, key, value, full=False):
-        self.check_thread()
-        if self.sequence == sequence:
-            # Only update the cache if the caches sequence number matches the
-            # number that the cache had before the SELECT was started (SYN-369)
-            if full:
-                self._insert(key, value)
-            else:
-                self._update_or_insert(key, value)
+        try:
+            self.check_thread()
+            if self.sequence == sequence:
+                # Only update the cache if the caches sequence number matches the
+                # number that the cache had before the SELECT was started (SYN-369)
+                if full:
+                    self._insert(key, value)
+                else:
+                    self._update_or_insert(key, value)
+        except:
+            logger.exception("update failed")
+            raise
 
     def _update_or_insert(self, key, value):
         entry = self.cache.setdefault(key, DictionaryEntry(False, {}))
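For reference, an illustrative end-to-end use of DictionaryCache (the key and values are invented; in this commit the keys are state group IDs and the values are (type, state_key) -> event maps):

    cache = DictionaryCache("*stateGroupCache*", 100000)

    seq = cache.sequence
    cache.update(seq, key=4721, full=False,
                 value={("m.room.name", ""): "<name event>"})

    entry = cache.get(4721, dict_keys=[("m.room.name", "")])
    assert entry.full is False             # only part of the group is cached
    assert ("m.room.name", "") in entry.value

    # invalidate() bumps cache.sequence, so any in-flight update() still
    # holding the old sequence number is silently dropped (SYN-369).
    cache.invalidate(4721)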

tests/test_state.py

@@ -69,7 +69,7 @@ class StateGroupStore(object):
         self._next_group = 1
 
-    def get_state_groups(self, event_ids):
+    def get_state_groups(self, room_id, event_ids):
         groups = {}
         for event_id in event_ids:
             group = self._event_to_state_group.get(event_id)