Add metrics to track `/messages` response time by room size (#13545)

Follow-up to https://github.com/matrix-org/synapse/pull/13533 Part of https://github.com/matrix-org/synapse/issues/13356
2022-08-18 14:15:53 -05:00 · 2022-08-18 14:15:53 -05:00 · 2c42673a9b
parent b251cff819
commit 2c42673a9b
2 changed files with 54 additions and 2 deletions
--- a/changelog.d/13545.misc
+++ b/changelog.d/13545.misc
@ -0,0 +1 @@
+Update metrics to track `/messages` response time by room size.
--- a/synapse/rest/client/room.py
+++ b/synapse/rest/client/room.py
@ -16,6 +16,7 @@
 """ This module contains REST servlets to do with rooms: /rooms/<paths> """
 import logging
 import re
+from enum import Enum
 from typing import TYPE_CHECKING, Awaitable, Dict, List, Optional, Tuple
 from urllib import parse as urlparse

@ -48,6 +49,7 @@ from synapse.http.servlet import (
    parse_strings_from_args,
 )
 from synapse.http.site import SynapseRequest
+from synapse.logging.context import make_deferred_yieldable, run_in_background
 from synapse.logging.opentracing import set_tag
 from synapse.rest.client._base import client_patterns
 from synapse.rest.client.transactions import HttpTransactionCache
@ -62,6 +64,33 @@ if TYPE_CHECKING:

 logger = logging.getLogger(__name__)

+
+class _RoomSize(Enum):
+    """
+    Enum to differentiate sizes of rooms. This is a pretty good approximation
+    about how hard it will be to get events in the room. We could also look at
+    room "complexity".
+    """
+
+    # This doesn't necessarily mean the room is a DM, just that there is a DM
+    # amount of people there.
+    DM_SIZE = "direct_message_size"
+    SMALL = "small"
+    SUBSTANTIAL = "substantial"
+    LARGE = "large"
+
+    @staticmethod
+    def from_member_count(member_count: int) -> "_RoomSize":
+        if member_count <= 2:
+            return _RoomSize.DM_SIZE
+        elif member_count < 100:
+            return _RoomSize.SMALL
+        elif member_count < 1000:
+            return _RoomSize.SUBSTANTIAL
+        else:
+            return _RoomSize.LARGE
+
+
 # This is an extra metric on top of `synapse_http_server_response_time_seconds`
 # which times the same sort of thing but this one allows us to see values
 # greater than 10s. We use a separate dedicated histogram with its own buckets
@ -70,7 +99,11 @@ logger = logging.getLogger(__name__)
 messsages_response_timer = Histogram(
    "synapse_room_message_list_rest_servlet_response_time_seconds",
    "sec",
-    [],
+    # We have a label for room size so we can try to see a more realistic
+    # picture of /messages response time for bigger rooms. We don't want the
+    # tiny rooms that can always respond fast skewing our results when we're trying
+    # to optimize the bigger cases.
+    ["room_size"],
    buckets=(
        0.005,
        0.01,
@ -587,14 +620,26 @@ class RoomMessageListRestServlet(RestServlet):
    def __init__(self, hs: "HomeServer"):
        super().__init__()
        self._hs = hs
+        self.clock = hs.get_clock()
        self.pagination_handler = hs.get_pagination_handler()
        self.auth = hs.get_auth()
        self.store = hs.get_datastores().main

-    @messsages_response_timer.time()
    async def on_GET(
        self, request: SynapseRequest, room_id: str
    ) -> Tuple[int, JsonDict]:
+        processing_start_time = self.clock.time_msec()
+        # Fire off and hope that we get a result by the end.
+        #
+        # We're using the mypy type ignore comment because the `@cached`
+        # decorator on `get_number_joined_users_in_room` doesn't play well with
+        # the type system. Maybe in the future, it can use some ParamSpec
+        # wizardry to fix it up.
+        room_member_count_deferred = run_in_background(  # type: ignore[call-arg]
+            self.store.get_number_joined_users_in_room,
+            room_id,  # type: ignore[arg-type]
+        )
+
        requester = await self.auth.get_user_by_req(request, allow_guest=True)
        pagination_config = await PaginationConfig.from_request(
            self.store, request, default_limit=10
@ -625,6 +670,12 @@ class RoomMessageListRestServlet(RestServlet):
            event_filter=event_filter,
        )

+        processing_end_time = self.clock.time_msec()
+        room_member_count = await make_deferred_yieldable(room_member_count_deferred)
+        messsages_response_timer.labels(
+            room_size=_RoomSize.from_member_count(room_member_count)
+        ).observe((processing_start_time - processing_end_time) / 1000)
+
        return 200, msgs
				`@ -0,0 +1 @@`
				Update metrics to track `/messages` response time by room size.