synapse-old/synapse/handlers/federation.py

1974 lines
82 KiB
Python

# Copyright 2014-2022 The Matrix.org Foundation C.I.C.
# Copyright 2020 Sorunome
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains handlers for federation events."""
import enum
import itertools
import logging
from enum import Enum
from http import HTTPStatus
from typing import (
TYPE_CHECKING,
AbstractSet,
Dict,
Iterable,
List,
Optional,
Set,
Tuple,
Union,
)
import attr
from prometheus_client import Histogram
from signedjson.key import decode_verify_key_bytes
from signedjson.sign import verify_signed_json
from unpaddedbase64 import decode_base64
from synapse import event_auth
from synapse.api.constants import MAX_DEPTH, EventContentFields, EventTypes, Membership
from synapse.api.errors import (
AuthError,
CodeMessageException,
Codes,
FederationDeniedError,
FederationError,
FederationPullAttemptBackoffError,
HttpResponseException,
NotFoundError,
RequestSendFailed,
SynapseError,
)
from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, RoomVersion
from synapse.crypto.event_signing import compute_event_signature
from synapse.event_auth import validate_event_for_room_version
from synapse.events import EventBase
from synapse.events.snapshot import EventContext
from synapse.events.validator import EventValidator
from synapse.federation.federation_client import InvalidResponseError
from synapse.http.servlet import assert_params_in_dict
from synapse.logging.context import nested_logging_context
from synapse.logging.opentracing import SynapseTags, set_tag, tag_args, trace
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.module_api import NOT_SPAM
from synapse.replication.http.federation import (
ReplicationCleanRoomRestServlet,
ReplicationStoreRoomOnOutlierMembershipRestServlet,
)
from synapse.storage.databases.main.events import PartialStateConflictError
from synapse.storage.databases.main.events_worker import EventRedactBehaviour
from synapse.types import JsonDict, StrCollection, get_domain_from_id
from synapse.types.state import StateFilter
from synapse.util.async_helpers import Linearizer
from synapse.util.retryutils import NotRetryingDestination
from synapse.visibility import filter_events_for_server
if TYPE_CHECKING:
from synapse.server import HomeServer
logger = logging.getLogger(__name__)
# Added to debug performance and track progress on optimizations
backfill_processing_before_timer = Histogram(
"synapse_federation_backfill_processing_before_time_seconds",
"sec",
[],
buckets=(
0.1,
0.5,
1.0,
2.5,
5.0,
7.5,
10.0,
15.0,
20.0,
30.0,
40.0,
60.0,
80.0,
"+Inf",
),
)
class _BackfillPointType(Enum):
# a regular backwards extremity (ie, an event which we don't yet have, but which
# is referred to by other events in the DAG)
BACKWARDS_EXTREMITY = enum.auto()
# an MSC2716 "insertion event"
INSERTION_PONT = enum.auto()
@attr.s(slots=True, auto_attribs=True, frozen=True)
class _BackfillPoint:
"""A potential point we might backfill from"""
event_id: str
depth: int
type: _BackfillPointType
class FederationHandler:
"""Handles general incoming federation requests
Incoming events are *not* handled here, for which see FederationEventHandler.
"""
def __init__(self, hs: "HomeServer"):
self.hs = hs
self.clock = hs.get_clock()
self.store = hs.get_datastores().main
self._storage_controllers = hs.get_storage_controllers()
self._state_storage_controller = self._storage_controllers.state
self.federation_client = hs.get_federation_client()
self.state_handler = hs.get_state_handler()
self.server_name = hs.hostname
self.keyring = hs.get_keyring()
self.is_mine_id = hs.is_mine_id
self.spam_checker = hs.get_spam_checker()
self.event_creation_handler = hs.get_event_creation_handler()
self.event_builder_factory = hs.get_event_builder_factory()
self._event_auth_handler = hs.get_event_auth_handler()
self._server_notices_mxid = hs.config.servernotices.server_notices_mxid
self.config = hs.config
self.http_client = hs.get_proxied_blacklisted_http_client()
self._replication = hs.get_replication_data_handler()
self._federation_event_handler = hs.get_federation_event_handler()
self._device_handler = hs.get_device_handler()
self._bulk_push_rule_evaluator = hs.get_bulk_push_rule_evaluator()
self._notifier = hs.get_notifier()
self._clean_room_for_join_client = ReplicationCleanRoomRestServlet.make_client(
hs
)
if hs.config.worker.worker_app:
self._maybe_store_room_on_outlier_membership = (
ReplicationStoreRoomOnOutlierMembershipRestServlet.make_client(hs)
)
else:
self._maybe_store_room_on_outlier_membership = (
self.store.maybe_store_room_on_outlier_membership
)
self._room_backfill = Linearizer("room_backfill")
self.third_party_event_rules = hs.get_third_party_event_rules()
# Tracks running partial state syncs by room ID.
# Partial state syncs currently only run on the main process, so it's okay to
# track them in-memory for now.
self._active_partial_state_syncs: Set[str] = set()
# Tracks partial state syncs we may want to restart.
# A dictionary mapping room IDs to (initial destination, other destinations)
# tuples.
self._partial_state_syncs_maybe_needing_restart: Dict[
str, Tuple[Optional[str], AbstractSet[str]]
] = {}
# A lock guarding the partial state flag for rooms.
# When the lock is held for a given room, no other concurrent code may
# partial state or un-partial state the room.
self._is_partial_state_room_linearizer = Linearizer(
name="_is_partial_state_room_linearizer"
)
# if this is the main process, fire off a background process to resume
# any partial-state-resync operations which were in flight when we
# were shut down.
if not hs.config.worker.worker_app:
run_as_background_process(
"resume_sync_partial_state_room", self._resume_partial_state_room_sync
)
@trace
async def maybe_backfill(
self, room_id: str, current_depth: int, limit: int
) -> bool:
"""Checks the database to see if we should backfill before paginating,
and if so do.
Args:
room_id
current_depth: The depth from which we're paginating from. This is
used to decide if we should backfill and what extremities to
use.
limit: The number of events that the pagination request will
return. This is used as part of the heuristic to decide if we
should back paginate.
"""
# Starting the processing time here so we can include the room backfill
# linearizer lock queue in the timing
processing_start_time = self.clock.time_msec()
async with self._room_backfill.queue(room_id):
return await self._maybe_backfill_inner(
room_id,
current_depth,
limit,
processing_start_time=processing_start_time,
)
async def _maybe_backfill_inner(
self,
room_id: str,
current_depth: int,
limit: int,
*,
processing_start_time: Optional[int],
) -> bool:
"""
Checks whether the `current_depth` is at or approaching any backfill
points in the room and if so, will backfill. We only care about
checking backfill points that happened before the `current_depth`
(meaning less than or equal to the `current_depth`).
Args:
room_id: The room to backfill in.
current_depth: The depth to check at for any upcoming backfill points.
limit: The max number of events to request from the remote federated server.
processing_start_time: The time when `maybe_backfill` started processing.
Only used for timing. If `None`, no timing observation will be made.
"""
backwards_extremities = [
_BackfillPoint(event_id, depth, _BackfillPointType.BACKWARDS_EXTREMITY)
for event_id, depth in await self.store.get_backfill_points_in_room(
room_id=room_id,
current_depth=current_depth,
# We only need to end up with 5 extremities combined with the
# insertion event extremities to make the `/backfill` request
# but fetch an order of magnitude more to make sure there is
# enough even after we filter them by whether visible in the
# history. This isn't fool-proof as all backfill points within
# our limit could be filtered out but seems like a good amount
# to try with at least.
limit=50,
)
]
insertion_events_to_be_backfilled: List[_BackfillPoint] = []
if self.hs.config.experimental.msc2716_enabled:
insertion_events_to_be_backfilled = [
_BackfillPoint(event_id, depth, _BackfillPointType.INSERTION_PONT)
for event_id, depth in await self.store.get_insertion_event_backward_extremities_in_room(
room_id=room_id,
current_depth=current_depth,
# We only need to end up with 5 extremities combined with
# the backfill points to make the `/backfill` request ...
# (see the other comment above for more context).
limit=50,
)
]
logger.debug(
"_maybe_backfill_inner: backwards_extremities=%s insertion_events_to_be_backfilled=%s",
backwards_extremities,
insertion_events_to_be_backfilled,
)
# we now have a list of potential places to backpaginate from. We prefer to
# start with the most recent (ie, max depth), so let's sort the list.
sorted_backfill_points: List[_BackfillPoint] = sorted(
itertools.chain(
backwards_extremities,
insertion_events_to_be_backfilled,
),
key=lambda e: -int(e.depth),
)
logger.debug(
"_maybe_backfill_inner: room_id: %s: current_depth: %s, limit: %s, "
"backfill points (%d): %s",
room_id,
current_depth,
limit,
len(sorted_backfill_points),
sorted_backfill_points,
)
# If we have no backfill points lower than the `current_depth` then
# either we can a) bail or b) still attempt to backfill. We opt to try
# backfilling anyway just in case we do get relevant events.
if not sorted_backfill_points and current_depth != MAX_DEPTH:
logger.debug(
"_maybe_backfill_inner: all backfill points are *after* current depth. Trying again with later backfill points."
)
return await self._maybe_backfill_inner(
room_id=room_id,
# We use `MAX_DEPTH` so that we find all backfill points next
# time (all events are below the `MAX_DEPTH`)
current_depth=MAX_DEPTH,
limit=limit,
# We don't want to start another timing observation from this
# nested recursive call. The top-most call can record the time
# overall otherwise the smaller one will throw off the results.
processing_start_time=None,
)
# Even after recursing with `MAX_DEPTH`, we didn't find any
# backward extremities to backfill from.
if not sorted_backfill_points:
logger.debug(
"_maybe_backfill_inner: Not backfilling as no backward extremeties found."
)
return False
# If we're approaching an extremity we trigger a backfill, otherwise we
# no-op.
#
# We chose twice the limit here as then clients paginating backwards
# will send pagination requests that trigger backfill at least twice
# using the most recent extremity before it gets removed (see below). We
# chose more than one times the limit in case of failure, but choosing a
# much larger factor will result in triggering a backfill request much
# earlier than necessary.
max_depth_of_backfill_points = sorted_backfill_points[0].depth
if current_depth - 2 * limit > max_depth_of_backfill_points:
logger.debug(
"Not backfilling as we don't need to. %d < %d - 2 * %d",
max_depth_of_backfill_points,
current_depth,
limit,
)
return False
# For performance's sake, we only want to paginate from a particular extremity
# if we can actually see the events we'll get. Otherwise, we'd just spend a lot
# of resources to get redacted events. We check each extremity in turn and
# ignore those which users on our server wouldn't be able to see.
#
# Additionally, we limit ourselves to backfilling from at most 5 extremities,
# for two reasons:
#
# - The check which determines if we can see an extremity's events can be
# expensive (we load the full state for the room at each of the backfill
# points, or (worse) their successors)
# - We want to avoid the server-server API request URI becoming too long.
#
# *Note*: the spec wants us to keep backfilling until we reach the start
# of the room in case we are allowed to see some of the history. However,
# in practice that causes more issues than its worth, as (a) it's
# relatively rare for there to be any visible history and (b) even when
# there is it's often sufficiently long ago that clients would stop
# attempting to paginate before backfill reached the visible history.
extremities_to_request: List[str] = []
for bp in sorted_backfill_points:
if len(extremities_to_request) >= 5:
break
# For regular backwards extremities, we don't have the extremity events
# themselves, so we need to actually check the events that reference them -
# their "successor" events.
#
# TODO: Correctly handle the case where we are allowed to see the
# successor event but not the backward extremity, e.g. in the case of
# initial join of the server where we are allowed to see the join
# event but not anything before it. This would require looking at the
# state *before* the event, ignoring the special casing certain event
# types have.
if bp.type == _BackfillPointType.INSERTION_PONT:
event_ids_to_check = [bp.event_id]
else:
event_ids_to_check = await self.store.get_successor_events(bp.event_id)
events_to_check = await self.store.get_events_as_list(
event_ids_to_check,
redact_behaviour=EventRedactBehaviour.as_is,
get_prev_content=False,
)
# We set `check_history_visibility_only` as we might otherwise get false
# positives from users having been erased.
filtered_extremities = await filter_events_for_server(
self._storage_controllers,
self.server_name,
self.server_name,
events_to_check,
redact=False,
check_history_visibility_only=True,
)
if filtered_extremities:
extremities_to_request.append(bp.event_id)
else:
logger.debug(
"_maybe_backfill_inner: skipping extremity %s as it would not be visible",
bp,
)
if not extremities_to_request:
logger.debug(
"_maybe_backfill_inner: found no extremities which would be visible"
)
return False
logger.debug(
"_maybe_backfill_inner: extremities_to_request %s", extremities_to_request
)
set_tag(
SynapseTags.RESULT_PREFIX + "extremities_to_request",
str(extremities_to_request),
)
set_tag(
SynapseTags.RESULT_PREFIX + "extremities_to_request.length",
str(len(extremities_to_request)),
)
# Now we need to decide which hosts to hit first.
# First we try hosts that are already in the room.
# TODO: HEURISTIC ALERT.
likely_domains = (
await self._storage_controllers.state.get_current_hosts_in_room_ordered(
room_id
)
)
async def try_backfill(domains: StrCollection) -> bool:
# TODO: Should we try multiple of these at a time?
# Number of contacted remote homeservers that have denied our backfill
# request with a 4xx code.
denied_count = 0
# Maximum number of contacted remote homeservers that can deny our
# backfill request with 4xx codes before we give up.
max_denied_count = 5
for dom in domains:
# We don't want to ask our own server for information we don't have
if dom == self.server_name:
continue
try:
await self._federation_event_handler.backfill(
dom, room_id, limit=100, extremities=extremities_to_request
)
# If this succeeded then we probably already have the
# appropriate stuff.
# TODO: We can probably do something more intelligent here.
return True
except NotRetryingDestination as e:
logger.info("_maybe_backfill_inner: %s", e)
continue
except FederationDeniedError:
logger.info(
"_maybe_backfill_inner: Not attempting to backfill from %s because the homeserver is not on our federation whitelist",
dom,
)
continue
except (SynapseError, InvalidResponseError) as e:
logger.info("Failed to backfill from %s because %s", dom, e)
continue
except HttpResponseException as e:
if 400 <= e.code < 500:
logger.warning(
"Backfill denied from %s because %s [%d/%d]",
dom,
e,
denied_count,
max_denied_count,
)
denied_count += 1
if denied_count >= max_denied_count:
return False
continue
logger.info("Failed to backfill from %s because %s", dom, e)
continue
except CodeMessageException as e:
if 400 <= e.code < 500:
logger.warning(
"Backfill denied from %s because %s [%d/%d]",
dom,
e,
denied_count,
max_denied_count,
)
denied_count += 1
if denied_count >= max_denied_count:
return False
continue
logger.info("Failed to backfill from %s because %s", dom, e)
continue
except RequestSendFailed as e:
logger.info("Failed to get backfill from %s because %s", dom, e)
continue
except Exception as e:
logger.exception("Failed to backfill from %s because %s", dom, e)
continue
return False
# If we have the `processing_start_time`, then we can make an
# observation. We wouldn't have the `processing_start_time` in the case
# where `_maybe_backfill_inner` is recursively called to find any
# backfill points regardless of `current_depth`.
if processing_start_time is not None:
processing_end_time = self.clock.time_msec()
backfill_processing_before_timer.observe(
(processing_end_time - processing_start_time) / 1000
)
success = await try_backfill(likely_domains)
if success:
return True
# TODO: we could also try servers which were previously in the room, but
# are no longer.
return False
async def send_invite(self, target_host: str, event: EventBase) -> EventBase:
"""Sends the invite to the remote server for signing.
Invites must be signed by the invitee's server before distribution.
"""
try:
pdu = await self.federation_client.send_invite(
destination=target_host,
room_id=event.room_id,
event_id=event.event_id,
pdu=event,
)
except RequestSendFailed:
raise SynapseError(502, f"Can't connect to server {target_host}")
return pdu
async def on_event_auth(self, event_id: str) -> List[EventBase]:
event = await self.store.get_event(event_id)
auth = await self.store.get_auth_chain(
event.room_id, list(event.auth_event_ids()), include_given=True
)
return list(auth)
async def do_invite_join(
self, target_hosts: Iterable[str], room_id: str, joinee: str, content: JsonDict
) -> Tuple[str, int]:
"""Attempts to join the `joinee` to the room `room_id` via the
servers contained in `target_hosts`.
This first triggers a /make_join/ request that returns a partial
event that we can fill out and sign. This is then sent to the
remote server via /send_join/ which responds with the state at that
event and the auth_chains.
We suspend processing of any received events from this room until we
have finished processing the join.
Args:
target_hosts: List of servers to attempt to join the room with.
room_id: The ID of the room to join.
joinee: The User ID of the joining user.
content: The event content to use for the join event.
"""
# TODO: We should be able to call this on workers, but the upgrading of
# room stuff after join currently doesn't work on workers.
# TODO: Before we relax this condition, we need to allow re-syncing of
# partial room state to happen on workers.
assert self.config.worker.worker_app is None
logger.debug("Joining %s to %s", joinee, room_id)
origin, event, room_version_obj = await self._make_and_verify_event(
target_hosts,
room_id,
joinee,
"join",
content,
params={"ver": KNOWN_ROOM_VERSIONS},
)
# This shouldn't happen, because the RoomMemberHandler has a
# linearizer lock which only allows one operation per user per room
# at a time - so this is just paranoia.
assert room_id not in self._federation_event_handler.room_queues
self._federation_event_handler.room_queues[room_id] = []
is_host_joined = await self.store.is_host_joined(room_id, self.server_name)
if not is_host_joined:
# We may have old forward extremities lying around if the homeserver left
# the room completely in the past. Clear them out.
#
# Note that this check-then-clear is subject to races where
# * the homeserver is in the room and stops being in the room just after
# the check. We won't reset the forward extremities, but that's okay,
# since they will be almost up to date.
# * the homeserver is not in the room and starts being in the room just
# after the check. This can't happen, since `RoomMemberHandler` has a
# linearizer lock which prevents concurrent remote joins into the same
# room.
# In short, the races either have an acceptable outcome or should be
# impossible.
await self._clean_room_for_join(room_id)
try:
# Try the host we successfully got a response to /make_join/
# request first.
host_list = list(target_hosts)
try:
host_list.remove(origin)
host_list.insert(0, origin)
except ValueError:
pass
async with self._is_partial_state_room_linearizer.queue(room_id):
already_partial_state_room = await self.store.is_partial_state_room(
room_id
)
ret = await self.federation_client.send_join(
host_list,
event,
room_version_obj,
# Perform a full join when we are already in the room and it is a
# full state room, since we are not allowed to persist a partial
# state join event in a full state room. In the future, we could
# optimize this by always performing a partial state join and
# computing the state ourselves or retrieving it from the remote
# homeserver if necessary.
#
# There's a race where we leave the room, then perform a full join
# anyway. This should end up being fast anyway, since we would
# already have the full room state and auth chain persisted.
partial_state=not is_host_joined or already_partial_state_room,
)
event = ret.event
origin = ret.origin
state = ret.state
auth_chain = ret.auth_chain
auth_chain.sort(key=lambda e: e.depth)
logger.debug("do_invite_join auth_chain: %s", auth_chain)
logger.debug("do_invite_join state: %s", state)
logger.debug("do_invite_join event: %s", event)
# if this is the first time we've joined this room, it's time to add
# a row to `rooms` with the correct room version. If there's already a
# row there, we should override it, since it may have been populated
# based on an invite request which lied about the room version.
#
# federation_client.send_join has already checked that the room
# version in the received create event is the same as room_version_obj,
# so we can rely on it now.
#
await self.store.upsert_room_on_join(
room_id=room_id,
room_version=room_version_obj,
state_events=state,
)
if ret.partial_state and not already_partial_state_room:
# Mark the room as having partial state.
# The background process is responsible for unmarking this flag,
# even if the join fails.
# TODO(faster_joins):
# We may want to reset the partial state info if it's from an
# old, failed partial state join.
# https://github.com/matrix-org/synapse/issues/13000
await self.store.store_partial_state_room(
room_id=room_id,
servers=ret.servers_in_room,
device_lists_stream_id=self.store.get_device_stream_token(),
joined_via=origin,
)
try:
max_stream_id = (
await self._federation_event_handler.process_remote_join(
origin,
room_id,
auth_chain,
state,
event,
room_version_obj,
partial_state=ret.partial_state,
)
)
except PartialStateConflictError:
# This should be impossible, since we hold the lock on the room's
# partial statedness.
logger.error(
"Room %s was un-partial stated while processing remote join.",
room_id,
)
raise
else:
# Record the join event id for future use (when we finish the full
# join). We have to do this after persisting the event to keep
# foreign key constraints intact.
if ret.partial_state and not already_partial_state_room:
# TODO(faster_joins):
# We may want to reset the partial state info if it's from
# an old, failed partial state join.
# https://github.com/matrix-org/synapse/issues/13000
await self.store.write_partial_state_rooms_join_event_id(
room_id, event.event_id
)
finally:
# Always kick off the background process that asynchronously fetches
# state for the room.
# If the join failed, the background process is responsible for
# cleaning up — including unmarking the room as a partial state
# room.
if ret.partial_state:
# Kick off the process of asynchronously fetching the state for
# this room.
self._start_partial_state_room_sync(
initial_destination=origin,
other_destinations=ret.servers_in_room,
room_id=room_id,
)
# We wait here until this instance has seen the events come down
# replication (if we're using replication) as the below uses caches.
await self._replication.wait_for_stream_position(
self.config.worker.events_shard_config.get_instance(room_id),
"events",
max_stream_id,
)
# Check whether this room is the result of an upgrade of a room we already know
# about. If so, migrate over user information
predecessor = await self.store.get_room_predecessor(room_id)
if not predecessor or not isinstance(predecessor.get("room_id"), str):
return event.event_id, max_stream_id
old_room_id = predecessor["room_id"]
logger.debug(
"Found predecessor for %s during remote join: %s", room_id, old_room_id
)
# We retrieve the room member handler here as to not cause a cyclic dependency
member_handler = self.hs.get_room_member_handler()
await member_handler.transfer_room_state_on_room_upgrade(
old_room_id, room_id
)
logger.debug("Finished joining %s to %s", joinee, room_id)
return event.event_id, max_stream_id
finally:
room_queue = self._federation_event_handler.room_queues[room_id]
del self._federation_event_handler.room_queues[room_id]
# we don't need to wait for the queued events to be processed -
# it's just a best-effort thing at this point. We do want to do
# them roughly in order, though, otherwise we'll end up making
# lots of requests for missing prev_events which we do actually
# have. Hence we fire off the background task, but don't wait for it.
run_as_background_process(
"handle_queued_pdus", self._handle_queued_pdus, room_queue
)
async def do_knock(
self,
target_hosts: List[str],
room_id: str,
knockee: str,
content: JsonDict,
) -> Tuple[str, int]:
"""Sends the knock to the remote server.
This first triggers a make_knock request that returns a partial
event that we can fill out and sign. This is then sent to the
remote server via send_knock.
Knock events must be signed by the knockee's server before distributing.
Args:
target_hosts: A list of hosts that we want to try knocking through.
room_id: The ID of the room to knock on.
knockee: The ID of the user who is knocking.
content: The content of the knock event.
Returns:
A tuple of (event ID, stream ID).
Raises:
SynapseError: If the chosen remote server returns a 3xx/4xx code.
RuntimeError: If no servers were reachable.
"""
logger.debug("Knocking on room %s on behalf of user %s", room_id, knockee)
# Inform the remote server of the room versions we support
supported_room_versions = list(KNOWN_ROOM_VERSIONS.keys())
# Ask the remote server to create a valid knock event for us. Once received,
# we sign the event
params: Dict[str, Iterable[str]] = {"ver": supported_room_versions}
origin, event, event_format_version = await self._make_and_verify_event(
target_hosts, room_id, knockee, Membership.KNOCK, content, params=params
)
# Mark the knock as an outlier as we don't yet have the state at this point in
# the DAG.
event.internal_metadata.outlier = True
# ... but tell /sync to send it to clients anyway.
event.internal_metadata.out_of_band_membership = True
# Record the room ID and its version so that we have a record of the room
await self._maybe_store_room_on_outlier_membership(
room_id=event.room_id, room_version=event_format_version
)
# Initially try the host that we successfully called /make_knock on
try:
target_hosts.remove(origin)
target_hosts.insert(0, origin)
except ValueError:
pass
# Send the signed event back to the room, and potentially receive some
# further information about the room in the form of partial state events
knock_response = await self.federation_client.send_knock(target_hosts, event)
# Store any stripped room state events in the "unsigned" key of the event.
# This is a bit of a hack and is cribbing off of invites. Basically we
# store the room state here and retrieve it again when this event appears
# in the invitee's sync stream. It is stripped out for all other local users.
stripped_room_state = (
knock_response.get("knock_room_state")
# Since v1.37, Synapse incorrectly used "knock_state_events" for this field.
# Thus, we also check for a 'knock_state_events' to support old instances.
# See https://github.com/matrix-org/synapse/issues/14088.
or knock_response.get("knock_state_events")
)
if stripped_room_state is None:
raise KeyError(
"Missing 'knock_room_state' (or legacy 'knock_state_events') field in "
"send_knock response"
)
event.unsigned["knock_room_state"] = stripped_room_state
context = EventContext.for_outlier(self._storage_controllers)
stream_id = await self._federation_event_handler.persist_events_and_notify(
event.room_id, [(event, context)]
)
return event.event_id, stream_id
async def _handle_queued_pdus(
self, room_queue: List[Tuple[EventBase, str]]
) -> None:
"""Process PDUs which got queued up while we were busy send_joining.
Args:
room_queue: list of PDUs to be processed and the servers that sent them
"""
for p, origin in room_queue:
try:
logger.info(
"Processing queued PDU %s which was received while we were joining",
p,
)
with nested_logging_context(p.event_id):
await self._federation_event_handler.on_receive_pdu(origin, p)
except Exception as e:
logger.warning(
"Error handling queued PDU %s from %s: %s", p.event_id, origin, e
)
async def on_make_join_request(
self, origin: str, room_id: str, user_id: str
) -> EventBase:
"""We've received a /make_join/ request, so we create a partial
join event for the room and return that. We do *not* persist or
process it until the other server has signed it and sent it back.
Args:
origin: The (verified) server name of the requesting server.
room_id: Room to create join event in
user_id: The user to create the join for
"""
if get_domain_from_id(user_id) != origin:
logger.info(
"Got /make_join request for user %r from different origin %s, ignoring",
user_id,
origin,
)
raise SynapseError(403, "User not from origin", Codes.FORBIDDEN)
# checking the room version will check that we've actually heard of the room
# (and return a 404 otherwise)
room_version = await self.store.get_room_version(room_id)
if await self.store.is_partial_state_room(room_id):
# If our server is still only partially joined, we can't give a complete
# response to /make_join, so return a 404 as we would if we weren't in the
# room at all.
# The main reason we can't respond properly is that we need to know about
# the auth events for the join event that we would return.
# We also should not bother entertaining the /make_join since we cannot
# handle the /send_join.
logger.info(
"Rejecting /make_join to %s because it's a partial state room", room_id
)
raise SynapseError(
404,
"Unable to handle /make_join right now; this server is not fully joined.",
errcode=Codes.NOT_FOUND,
)
# now check that we are *still* in the room
is_in_room = await self._event_auth_handler.is_host_in_room(
room_id, self.server_name
)
if not is_in_room:
logger.info(
"Got /make_join request for room %s we are no longer in",
room_id,
)
raise NotFoundError("Not an active room on this server")
event_content = {"membership": Membership.JOIN}
# If the current room is using restricted join rules, additional information
# may need to be included in the event content in order to efficiently
# validate the event.
#
# Note that this requires the /send_join request to come back to the
# same server.
if room_version.msc3083_join_rules:
state_ids = await self._state_storage_controller.get_current_state_ids(
room_id
)
if await self._event_auth_handler.has_restricted_join_rules(
state_ids, room_version
):
prev_member_event_id = state_ids.get((EventTypes.Member, user_id), None)
# If the user is invited or joined to the room already, then
# no additional info is needed.
include_auth_user_id = True
if prev_member_event_id:
prev_member_event = await self.store.get_event(prev_member_event_id)
include_auth_user_id = prev_member_event.membership not in (
Membership.JOIN,
Membership.INVITE,
)
if include_auth_user_id:
event_content[
EventContentFields.AUTHORISING_USER
] = await self._event_auth_handler.get_user_which_could_invite(
room_id,
state_ids,
)
builder = self.event_builder_factory.for_room_version(
room_version,
{
"type": EventTypes.Member,
"content": event_content,
"room_id": room_id,
"sender": user_id,
"state_key": user_id,
},
)
try:
event, context = await self.event_creation_handler.create_new_client_event(
builder=builder
)
except SynapseError as e:
logger.warning("Failed to create join to %s because %s", room_id, e)
raise
# Ensure the user can even join the room.
await self._federation_event_handler.check_join_restrictions(context, event)
# The remote hasn't signed it yet, obviously. We'll do the full checks
# when we get the event back in `on_send_join_request`
await self._event_auth_handler.check_auth_rules_from_context(event)
return event
async def on_invite_request(
self, origin: str, event: EventBase, room_version: RoomVersion
) -> EventBase:
"""We've got an invite event. Process and persist it. Sign it.
Respond with the now signed event.
"""
if event.state_key is None:
raise SynapseError(400, "The invite event did not have a state key")
is_blocked = await self.store.is_room_blocked(event.room_id)
if is_blocked:
raise SynapseError(403, "This room has been blocked on this server")
if self.hs.config.server.block_non_admin_invites:
raise SynapseError(403, "This server does not accept room invites")
spam_check = await self.spam_checker.user_may_invite(
event.sender, event.state_key, event.room_id
)
if spam_check != NOT_SPAM:
raise SynapseError(
403,
"This user is not permitted to send invites to this server/user",
errcode=spam_check[0],
additional_fields=spam_check[1],
)
membership = event.content.get("membership")
if event.type != EventTypes.Member or membership != Membership.INVITE:
raise SynapseError(400, "The event was not an m.room.member invite event")
sender_domain = get_domain_from_id(event.sender)
if sender_domain != origin:
raise SynapseError(
400, "The invite event was not from the server sending it"
)
if not self.is_mine_id(event.state_key):
raise SynapseError(400, "The invite event must be for this server")
# block any attempts to invite the server notices mxid
if event.state_key == self._server_notices_mxid:
raise SynapseError(HTTPStatus.FORBIDDEN, "Cannot invite this user")
# We retrieve the room member handler here as to not cause a cyclic dependency
member_handler = self.hs.get_room_member_handler()
# We don't rate limit based on room ID, as that should be done by
# sending server.
await member_handler.ratelimit_invite(None, None, event.state_key)
# keep a record of the room version, if we don't yet know it.
# (this may get overwritten if we later get a different room version in a
# join dance).
await self._maybe_store_room_on_outlier_membership(
room_id=event.room_id, room_version=room_version
)
event.internal_metadata.outlier = True
event.internal_metadata.out_of_band_membership = True
event.signatures.update(
compute_event_signature(
room_version,
event.get_pdu_json(),
self.hs.hostname,
self.hs.signing_key,
)
)
context = EventContext.for_outlier(self._storage_controllers)
await self._bulk_push_rule_evaluator.action_for_events_by_user(
[(event, context)]
)
try:
await self._federation_event_handler.persist_events_and_notify(
event.room_id, [(event, context)]
)
except Exception:
await self.store.remove_push_actions_from_staging(event.event_id)
raise
return event
async def do_remotely_reject_invite(
self, target_hosts: Iterable[str], room_id: str, user_id: str, content: JsonDict
) -> Tuple[EventBase, int]:
origin, event, room_version = await self._make_and_verify_event(
target_hosts, room_id, user_id, "leave", content=content
)
# Mark as outlier as we don't have any state for this event; we're not
# even in the room.
event.internal_metadata.outlier = True
event.internal_metadata.out_of_band_membership = True
# Try the host that we successfully called /make_leave/ on first for
# the /send_leave/ request.
host_list = list(target_hosts)
try:
host_list.remove(origin)
host_list.insert(0, origin)
except ValueError:
pass
await self.federation_client.send_leave(host_list, event)
context = EventContext.for_outlier(self._storage_controllers)
stream_id = await self._federation_event_handler.persist_events_and_notify(
event.room_id, [(event, context)]
)
return event, stream_id
async def _make_and_verify_event(
self,
target_hosts: Iterable[str],
room_id: str,
user_id: str,
membership: str,
content: JsonDict,
params: Optional[Dict[str, Union[str, Iterable[str]]]] = None,
) -> Tuple[str, EventBase, RoomVersion]:
(
origin,
event,
room_version,
) = await self.federation_client.make_membership_event(
target_hosts, room_id, user_id, membership, content, params=params
)
logger.debug("Got response to make_%s: %s", membership, event)
# We should assert some things.
# FIXME: Do this in a nicer way
assert event.type == EventTypes.Member
assert event.user_id == user_id
assert event.state_key == user_id
assert event.room_id == room_id
return origin, event, room_version
async def on_make_leave_request(
self, origin: str, room_id: str, user_id: str
) -> EventBase:
"""We've received a /make_leave/ request, so we create a partial
leave event for the room and return that. We do *not* persist or
process it until the other server has signed it and sent it back.
Args:
origin: The (verified) server name of the requesting server.
room_id: Room to create leave event in
user_id: The user to create the leave for
"""
if get_domain_from_id(user_id) != origin:
logger.info(
"Got /make_leave request for user %r from different origin %s, ignoring",
user_id,
origin,
)
raise SynapseError(403, "User not from origin", Codes.FORBIDDEN)
room_version_obj = await self.store.get_room_version(room_id)
builder = self.event_builder_factory.for_room_version(
room_version_obj,
{
"type": EventTypes.Member,
"content": {"membership": Membership.LEAVE},
"room_id": room_id,
"sender": user_id,
"state_key": user_id,
},
)
event, context = await self.event_creation_handler.create_new_client_event(
builder=builder
)
try:
# The remote hasn't signed it yet, obviously. We'll do the full checks
# when we get the event back in `on_send_leave_request`
await self._event_auth_handler.check_auth_rules_from_context(event)
except AuthError as e:
logger.warning("Failed to create new leave %r because %s", event, e)
raise e
return event
async def on_make_knock_request(
self, origin: str, room_id: str, user_id: str
) -> EventBase:
"""We've received a make_knock request, so we create a partial
knock event for the room and return that. We do *not* persist or
process it until the other server has signed it and sent it back.
Args:
origin: The (verified) server name of the requesting server.
room_id: The room to create the knock event in.
user_id: The user to create the knock for.
Returns:
The partial knock event.
"""
if get_domain_from_id(user_id) != origin:
logger.info(
"Get /make_knock request for user %r from different origin %s, ignoring",
user_id,
origin,
)
raise SynapseError(403, "User not from origin", Codes.FORBIDDEN)
room_version_obj = await self.store.get_room_version(room_id)
builder = self.event_builder_factory.for_room_version(
room_version_obj,
{
"type": EventTypes.Member,
"content": {"membership": Membership.KNOCK},
"room_id": room_id,
"sender": user_id,
"state_key": user_id,
},
)
event, context = await self.event_creation_handler.create_new_client_event(
builder=builder
)
event_allowed, _ = await self.third_party_event_rules.check_event_allowed(
event, context
)
if not event_allowed:
logger.warning("Creation of knock %s forbidden by third-party rules", event)
raise SynapseError(
403, "This event is not allowed in this context", Codes.FORBIDDEN
)
try:
# The remote hasn't signed it yet, obviously. We'll do the full checks
# when we get the event back in `on_send_knock_request`
await self._event_auth_handler.check_auth_rules_from_context(event)
except AuthError as e:
logger.warning("Failed to create new knock %r because %s", event, e)
raise e
return event
@trace
@tag_args
async def get_state_ids_for_pdu(self, room_id: str, event_id: str) -> List[str]:
"""Returns the state at the event. i.e. not including said event."""
event = await self.store.get_event(event_id, check_room_id=room_id)
if event.internal_metadata.outlier:
raise NotFoundError("State not known at event %s" % (event_id,))
state_groups = await self._state_storage_controller.get_state_groups_ids(
room_id, [event_id]
)
# get_state_groups_ids should return exactly one result
assert len(state_groups) == 1
state_map = next(iter(state_groups.values()))
state_key = event.get_state_key()
if state_key is not None:
# the event was not rejected (get_event raises a NotFoundError for rejected
# events) so the state at the event should include the event itself.
assert (
state_map.get((event.type, state_key)) == event.event_id
), "State at event did not include event itself"
# ... but we need the state *before* that event
if "replaces_state" in event.unsigned:
prev_id = event.unsigned["replaces_state"]
state_map[(event.type, state_key)] = prev_id
else:
del state_map[(event.type, state_key)]
return list(state_map.values())
async def on_backfill_request(
self, origin: str, room_id: str, pdu_list: List[str], limit: int
) -> List[EventBase]:
# We allow partially joined rooms since in this case we are filtering out
# non-local events in `filter_events_for_server`.
await self._event_auth_handler.assert_host_in_room(room_id, origin, True)
# Synapse asks for 100 events per backfill request. Do not allow more.
limit = min(limit, 100)
events = await self.store.get_backfill_events(room_id, pdu_list, limit)
logger.debug(
"on_backfill_request: backfill events=%s",
[
"event_id=%s,depth=%d,body=%s,prevs=%s\n"
% (
event.event_id,
event.depth,
event.content.get("body", event.type),
event.prev_event_ids(),
)
for event in events
],
)
events = await filter_events_for_server(
self._storage_controllers, origin, self.server_name, events
)
return events
async def get_persisted_pdu(
self, origin: str, event_id: str
) -> Optional[EventBase]:
"""Get an event from the database for the given server.
Args:
origin: hostname of server which is requesting the event; we
will check that the server is allowed to see it.
event_id: id of the event being requested
Returns:
None if we know nothing about the event; otherwise the (possibly-redacted) event.
Raises:
AuthError if the server is not currently in the room
"""
event = await self.store.get_event(
event_id, allow_none=True, allow_rejected=True
)
if not event:
return None
await self._event_auth_handler.assert_host_in_room(event.room_id, origin)
events = await filter_events_for_server(
self._storage_controllers, origin, self.server_name, [event]
)
event = events[0]
return event
async def on_get_missing_events(
self,
origin: str,
room_id: str,
earliest_events: List[str],
latest_events: List[str],
limit: int,
) -> List[EventBase]:
# We allow partially joined rooms since in this case we are filtering out
# non-local events in `filter_events_for_server`.
await self._event_auth_handler.assert_host_in_room(room_id, origin, True)
# Only allow up to 20 events to be retrieved per request.
limit = min(limit, 20)
missing_events = await self.store.get_missing_events(
room_id=room_id,
earliest_events=earliest_events,
latest_events=latest_events,
limit=limit,
)
missing_events = await filter_events_for_server(
self._storage_controllers, origin, self.server_name, missing_events
)
return missing_events
async def exchange_third_party_invite(
self, sender_user_id: str, target_user_id: str, room_id: str, signed: JsonDict
) -> None:
third_party_invite = {"signed": signed}
event_dict = {
"type": EventTypes.Member,
"content": {
"membership": Membership.INVITE,
"third_party_invite": third_party_invite,
},
"room_id": room_id,
"sender": sender_user_id,
"state_key": target_user_id,
}
if await self._event_auth_handler.is_host_in_room(room_id, self.hs.hostname):
room_version_obj = await self.store.get_room_version(room_id)
builder = self.event_builder_factory.for_room_version(
room_version_obj, event_dict
)
EventValidator().validate_builder(builder)
# Try several times, it could fail with PartialStateConflictError
# in send_membership_event, cf comment in except block.
max_retries = 5
for i in range(max_retries):
try:
(
event,
context,
) = await self.event_creation_handler.create_new_client_event(
builder=builder
)
event, context = await self.add_display_name_to_third_party_invite(
room_version_obj, event_dict, event, context
)
EventValidator().validate_new(event, self.config)
# We need to tell the transaction queue to send this out, even
# though the sender isn't a local user.
event.internal_metadata.send_on_behalf_of = self.hs.hostname
try:
validate_event_for_room_version(event)
await self._event_auth_handler.check_auth_rules_from_context(
event
)
except AuthError as e:
logger.warning(
"Denying new third party invite %r because %s", event, e
)
raise e
await self._check_signature(event, context)
# We retrieve the room member handler here as to not cause a cyclic dependency
member_handler = self.hs.get_room_member_handler()
await member_handler.send_membership_event(None, event, context)
break
except PartialStateConflictError as e:
# Persisting couldn't happen because the room got un-partial stated
# in the meantime and context needs to be recomputed, so let's do so.
if i == max_retries - 1:
raise e
pass
else:
destinations = {x.split(":", 1)[-1] for x in (sender_user_id, room_id)}
try:
await self.federation_client.forward_third_party_invite(
destinations, room_id, event_dict
)
except (RequestSendFailed, HttpResponseException):
raise SynapseError(502, "Failed to forward third party invite")
async def on_exchange_third_party_invite_request(
self, event_dict: JsonDict
) -> None:
"""Handle an exchange_third_party_invite request from a remote server
The remote server will call this when it wants to turn a 3pid invite
into a normal m.room.member invite.
Args:
event_dict: Dictionary containing the event body.
"""
assert_params_in_dict(event_dict, ["room_id"])
room_version_obj = await self.store.get_room_version(event_dict["room_id"])
# NB: event_dict has a particular specced format we might need to fudge
# if we change event formats too much.
builder = self.event_builder_factory.for_room_version(
room_version_obj, event_dict
)
# Try several times, it could fail with PartialStateConflictError
# in send_membership_event, cf comment in except block.
max_retries = 5
for i in range(max_retries):
try:
(
event,
context,
) = await self.event_creation_handler.create_new_client_event(
builder=builder
)
event, context = await self.add_display_name_to_third_party_invite(
room_version_obj, event_dict, event, context
)
try:
validate_event_for_room_version(event)
await self._event_auth_handler.check_auth_rules_from_context(event)
except AuthError as e:
logger.warning("Denying third party invite %r because %s", event, e)
raise e
await self._check_signature(event, context)
# We need to tell the transaction queue to send this out, even
# though the sender isn't a local user.
event.internal_metadata.send_on_behalf_of = get_domain_from_id(
event.sender
)
# We retrieve the room member handler here as to not cause a cyclic dependency
member_handler = self.hs.get_room_member_handler()
await member_handler.send_membership_event(None, event, context)
break
except PartialStateConflictError as e:
# Persisting couldn't happen because the room got un-partial stated
# in the meantime and context needs to be recomputed, so let's do so.
if i == max_retries - 1:
raise e
pass
async def add_display_name_to_third_party_invite(
self,
room_version_obj: RoomVersion,
event_dict: JsonDict,
event: EventBase,
context: EventContext,
) -> Tuple[EventBase, EventContext]:
key = (
EventTypes.ThirdPartyInvite,
event.content["third_party_invite"]["signed"]["token"],
)
original_invite = None
prev_state_ids = await context.get_prev_state_ids(
StateFilter.from_types([(EventTypes.ThirdPartyInvite, None)])
)
original_invite_id = prev_state_ids.get(key)
if original_invite_id:
original_invite = await self.store.get_event(
original_invite_id, allow_none=True
)
if original_invite:
# If the m.room.third_party_invite event's content is empty, it means the
# invite has been revoked. In this case, we don't have to raise an error here
# because the auth check will fail on the invite (because it's not able to
# fetch public keys from the m.room.third_party_invite event's content, which
# is empty).
display_name = original_invite.content.get("display_name")
event_dict["content"]["third_party_invite"]["display_name"] = display_name
else:
logger.info(
"Could not find invite event for third_party_invite: %r", event_dict
)
# We don't discard here as this is not the appropriate place to do
# auth checks. If we need the invite and don't have it then the
# auth check code will explode appropriately.
builder = self.event_builder_factory.for_room_version(
room_version_obj, event_dict
)
EventValidator().validate_builder(builder)
event, context = await self.event_creation_handler.create_new_client_event(
builder=builder
)
EventValidator().validate_new(event, self.config)
return event, context
async def _check_signature(self, event: EventBase, context: EventContext) -> None:
"""
Checks that the signature in the event is consistent with its invite.
Args:
event: The m.room.member event to check
context:
Raises:
AuthError: if signature didn't match any keys, or key has been
revoked,
SynapseError: if a transient error meant a key couldn't be checked
for revocation.
"""
signed = event.content["third_party_invite"]["signed"]
token = signed["token"]
prev_state_ids = await context.get_prev_state_ids(
StateFilter.from_types([(EventTypes.ThirdPartyInvite, None)])
)
invite_event_id = prev_state_ids.get((EventTypes.ThirdPartyInvite, token))
invite_event = None
if invite_event_id:
invite_event = await self.store.get_event(invite_event_id, allow_none=True)
if not invite_event:
raise AuthError(403, "Could not find invite")
logger.debug("Checking auth on event %r", event.content)
last_exception: Optional[Exception] = None
# for each public key in the 3pid invite event
for public_key_object in event_auth.get_public_keys(invite_event):
try:
# for each sig on the third_party_invite block of the actual invite
for server, signature_block in signed["signatures"].items():
for key_name in signature_block.keys():
if not key_name.startswith("ed25519:"):
continue
logger.debug(
"Attempting to verify sig with key %s from %r "
"against pubkey %r",
key_name,
server,
public_key_object,
)
try:
public_key = public_key_object["public_key"]
verify_key = decode_verify_key_bytes(
key_name, decode_base64(public_key)
)
verify_signed_json(signed, server, verify_key)
logger.debug(
"Successfully verified sig with key %s from %r "
"against pubkey %r",
key_name,
server,
public_key_object,
)
except Exception:
logger.info(
"Failed to verify sig with key %s from %r "
"against pubkey %r",
key_name,
server,
public_key_object,
)
raise
try:
if "key_validity_url" in public_key_object:
await self._check_key_revocation(
public_key, public_key_object["key_validity_url"]
)
except Exception:
logger.info(
"Failed to query key_validity_url %s",
public_key_object["key_validity_url"],
)
raise
return
except Exception as e:
last_exception = e
if last_exception is None:
# we can only get here if get_public_keys() returned an empty list
# TODO: make this better
raise RuntimeError("no public key in invite event")
raise last_exception
async def _check_key_revocation(self, public_key: str, url: str) -> None:
"""
Checks whether public_key has been revoked.
Args:
public_key: base-64 encoded public key.
url: Key revocation URL.
Raises:
AuthError: if they key has been revoked.
SynapseError: if a transient error meant a key couldn't be checked
for revocation.
"""
try:
response = await self.http_client.get_json(url, {"public_key": public_key})
except Exception:
raise SynapseError(502, "Third party certificate could not be checked")
if "valid" not in response or not response["valid"]:
raise AuthError(403, "Third party certificate was invalid")
async def _clean_room_for_join(self, room_id: str) -> None:
"""Called to clean up any data in DB for a given room, ready for the
server to join the room.
Args:
room_id
"""
if self.config.worker.worker_app:
await self._clean_room_for_join_client(room_id)
else:
await self.store.clean_room_for_join(room_id)
async def get_room_complexity(
self, remote_room_hosts: List[str], room_id: str
) -> Optional[dict]:
"""
Fetch the complexity of a remote room over federation.
Args:
remote_room_hosts: The remote servers to ask.
room_id: The room ID to ask about.
Returns:
Dict contains the complexity
metric versions, while None means we could not fetch the complexity.
"""
for host in remote_room_hosts:
res = await self.federation_client.get_room_complexity(host, room_id)
# We got a result, return it.
if res:
return res
# We fell off the bottom, couldn't get the complexity from anyone. Oh
# well.
return None
async def _resume_partial_state_room_sync(self) -> None:
"""Resumes resyncing of all partial-state rooms after a restart."""
assert not self.config.worker.worker_app
partial_state_rooms = await self.store.get_partial_state_room_resync_info()
for room_id, resync_info in partial_state_rooms.items():
self._start_partial_state_room_sync(
initial_destination=resync_info.joined_via,
other_destinations=resync_info.servers_in_room,
room_id=room_id,
)
def _start_partial_state_room_sync(
self,
initial_destination: Optional[str],
other_destinations: AbstractSet[str],
room_id: str,
) -> None:
"""Starts the background process to resync the state of a partial state room,
if it is not already running.
Args:
initial_destination: the initial homeserver to pull the state from
other_destinations: other homeservers to try to pull the state from, if
`initial_destination` is unavailable
room_id: room to be resynced
"""
async def _sync_partial_state_room_wrapper() -> None:
if room_id in self._active_partial_state_syncs:
# Another local user has joined the room while there is already a
# partial state sync running. This implies that there is a new join
# event to un-partial state. We might find ourselves in one of a few
# scenarios:
# 1. There is an existing partial state sync. The partial state sync
# un-partial states the new join event before completing and all is
# well.
# 2. Before the latest join, the homeserver was no longer in the room
# and there is an existing partial state sync from our previous
# membership of the room. The partial state sync may have:
# a) succeeded, but not yet terminated. The room will not be
# un-partial stated again unless we restart the partial state
# sync.
# b) failed, because we were no longer in the room and remote
# homeservers were refusing our requests, but not yet
# terminated. After the latest join, remote homeservers may
# start answering our requests again, so we should restart the
# partial state sync.
# In the cases where we would want to restart the partial state sync,
# the room would have the partial state flag when the partial state sync
# terminates.
self._partial_state_syncs_maybe_needing_restart[room_id] = (
initial_destination,
other_destinations,
)
return
self._active_partial_state_syncs.add(room_id)
try:
await self._sync_partial_state_room(
initial_destination=initial_destination,
other_destinations=other_destinations,
room_id=room_id,
)
finally:
# Read the room's partial state flag while we still hold the claim to
# being the active partial state sync (so that another partial state
# sync can't come along and mess with it under us).
# Normally, the partial state flag will be gone. If it isn't, then we
# may find ourselves in scenario 2a or 2b as described in the comment
# above, where we want to restart the partial state sync.
is_still_partial_state_room = await self.store.is_partial_state_room(
room_id
)
self._active_partial_state_syncs.remove(room_id)
if room_id in self._partial_state_syncs_maybe_needing_restart:
(
restart_initial_destination,
restart_other_destinations,
) = self._partial_state_syncs_maybe_needing_restart.pop(room_id)
if is_still_partial_state_room:
self._start_partial_state_room_sync(
initial_destination=restart_initial_destination,
other_destinations=restart_other_destinations,
room_id=room_id,
)
run_as_background_process(
desc="sync_partial_state_room", func=_sync_partial_state_room_wrapper
)
async def _sync_partial_state_room(
self,
initial_destination: Optional[str],
other_destinations: AbstractSet[str],
room_id: str,
) -> None:
"""Background process to resync the state of a partial-state room
Args:
initial_destination: the initial homeserver to pull the state from
other_destinations: other homeservers to try to pull the state from, if
`initial_destination` is unavailable
room_id: room to be resynced
"""
# Assume that we run on the main process for now.
# TODO(faster_joins,multiple workers)
# When moving the sync to workers, we need to ensure that
# * `_start_partial_state_room_sync` still prevents duplicate resyncs
# * `_is_partial_state_room_linearizer` correctly guards partial state flags
# for rooms between the workers doing remote joins and resync.
assert not self.config.worker.worker_app
# TODO(faster_joins): do we need to lock to avoid races? What happens if other
# worker processes kick off a resync in parallel? Perhaps we should just elect
# a single worker to do the resync.
# https://github.com/matrix-org/synapse/issues/12994
#
# TODO(faster_joins): what happens if we leave the room during a resync? if we
# really leave, that might mean we have difficulty getting the room state over
# federation.
# https://github.com/matrix-org/synapse/issues/12802
# Make an infinite iterator of destinations to try. Once we find a working
# destination, we'll stick with it until it flakes.
destinations = _prioritise_destinations_for_partial_state_resync(
initial_destination, other_destinations, room_id
)
destination_iter = itertools.cycle(destinations)
# `destination` is the current remote homeserver we're pulling from.
destination = next(destination_iter)
logger.info("Syncing state for room %s via %s", room_id, destination)
# we work through the queue in order of increasing stream ordering.
while True:
batch = await self.store.get_partial_state_events_batch(room_id)
if not batch:
# all the events are updated, so we can update current state and
# clear the lazy-loading flag.
logger.info("Updating current state for %s", room_id)
# TODO(faster_joins): notify workers in notify_room_un_partial_stated
# https://github.com/matrix-org/synapse/issues/12994
await self.state_handler.update_current_state(room_id)
logger.info("Handling any pending device list updates")
await self._device_handler.handle_room_un_partial_stated(room_id)
async with self._is_partial_state_room_linearizer.queue(room_id):
logger.info("Clearing partial-state flag for %s", room_id)
new_stream_id = await self.store.clear_partial_state_room(room_id)
if new_stream_id is not None:
logger.info("State resync complete for %s", room_id)
self._storage_controllers.state.notify_room_un_partial_stated(
room_id
)
await self._notifier.on_un_partial_stated_room(
room_id, new_stream_id
)
return
# we raced against more events arriving with partial state. Go round
# the loop again. We've already logged a warning, so no need for more.
continue
events = await self.store.get_events_as_list(
batch,
redact_behaviour=EventRedactBehaviour.as_is,
allow_rejected=True,
)
for event in events:
for attempt in itertools.count():
try:
await self._federation_event_handler.update_state_for_partial_state_event(
destination, event
)
break
except FederationPullAttemptBackoffError as exc:
# Log a warning about why we failed to process the event (the error message
# for `FederationPullAttemptBackoffError` is pretty good)
logger.warning("_sync_partial_state_room: %s", exc)
# We do not record a failed pull attempt when we backoff fetching a missing
# `prev_event` because not being able to fetch the `prev_events` just means
# we won't be able to de-outlier the pulled event. But we can still use an
# `outlier` in the state/auth chain for another event. So we shouldn't stop
# a downstream event from trying to pull it.
#
# This avoids a cascade of backoff for all events in the DAG downstream from
# one event backoff upstream.
except FederationError as e:
# TODO: We should `record_event_failed_pull_attempt` here,
# see https://github.com/matrix-org/synapse/issues/13700
if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up.
# TODO(faster_joins) giving up isn't the right thing to do
# if there's a temporary network outage. retrying
# indefinitely is also not the right thing to do if we can
# reach all homeservers and they all claim they don't have
# the state we want.
# https://github.com/matrix-org/synapse/issues/13000
logger.error(
"Failed to get state for %s at %s from %s because %s, "
"giving up!",
room_id,
event,
destination,
e,
)
raise
# Try the next remote server.
logger.info(
"Failed to get state for %s at %s from %s because %s",
room_id,
event,
destination,
e,
)
destination = next(destination_iter)
logger.info(
"Syncing state for room %s via %s instead",
room_id,
destination,
)
def _prioritise_destinations_for_partial_state_resync(
initial_destination: Optional[str],
other_destinations: AbstractSet[str],
room_id: str,
) -> StrCollection:
"""Work out the order in which we should ask servers to resync events.
If an `initial_destination` is given, it takes top priority. Otherwise
all servers are treated equally.
:raises ValueError: if no destination is provided at all.
"""
if initial_destination is None and len(other_destinations) == 0:
raise ValueError(f"Cannot resync state of {room_id}: no destinations provided")
if initial_destination is None:
return other_destinations
# Move `initial_destination` to the front of the list.
destinations = list(other_destinations)
if initial_destination in destinations:
destinations.remove(initial_destination)
destinations = [initial_destination] + destinations
return destinations