Reduce DB hits for replication

Some streams will occasionally advance their positions without actually
having any new rows to send over federation. Currently this means that
the token will not advance on the workers, leading to them repeatedly
sending a slightly out-of-date token. This in turn requires the master
to hit the DB to check whether there are any new rows, rather than
hitting the no-op logic where we check if the given token matches the
current token.

This commit changes the API to always return an entry if the position
for a stream has changed, allowing workers to advance their tokens
correctly.
Author: Erik Johnston
Date: 2016-09-23 15:31:47 +01:00
commit 748d8fdc7b (parent 667fcd54e8)

6 changed files with 115 additions and 45 deletions
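To make the failure mode concrete: a worker polls the master with its
last-seen token for each stream, and the master only short-circuits when
that token equals the current one. A minimal sketch of that control flow,
using hypothetical names (handle_stream_request, fetch_rows are
illustrative, not Synapse's actual API):

def handle_stream_request(worker_token, current_token, fetch_rows, limit=100):
    # Hypothetical sketch of the flow described above, not Synapse's real API.
    if worker_token == current_token:
        return None  # no-op path: the worker is up to date, no DB query needed

    rows = fetch_rows(worker_token, current_token, limit)  # the DB hit

    if rows:
        # The stream may lag behind current_token if we hit the limit,
        # so report the position of the last row actually sent.
        position = rows[-1][0]
    else:
        # The position advanced with no rows to send. Before this commit
        # nothing was returned here, so the worker kept its stale token and
        # every subsequent poll took the DB path again. Returning the
        # current position lets the worker advance and hit the no-op path
        # on its next poll.
        position = current_token

    return {"position": position, "rows": rows}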

.gitignore

@@ -24,10 +24,10 @@ homeserver*.yaml
 .coverage
 htmlcov
-demo/*.db
-demo/*.log
-demo/*.log.*
-demo/*.pid
+demo/*/*.db
+demo/*/*.log
+demo/*/*.log.*
+demo/*/*.pid
 demo/media_store.*
 demo/etc

synapse/app/pusher.py

@@ -197,7 +197,7 @@ class PusherServer(HomeServer):
                 yield start_pusher(user_id, app_id, pushkey)

             stream = results.get("events")
-            if stream:
+            if stream and stream["rows"]:
                 min_stream_id = stream["rows"][0][0]
                 max_stream_id = stream["position"]
                 preserve_fn(pusher_pool.on_new_notifications)(
@@ -205,7 +205,7 @@ class PusherServer(HomeServer):
                 )

             stream = results.get("receipts")
-            if stream:
+            if stream and stream["rows"]:
                 rows = stream["rows"]
                 affected_room_ids = set(row[1] for row in rows)
                 min_stream_id = rows[0][0]

synapse/replication/resource.py

@@ -17,6 +17,7 @@ from synapse.http.servlet import parse_integer, parse_string
 from synapse.http.server import request_handler, finish_request
 from synapse.replication.pusher_resource import PusherResource
 from synapse.replication.presence_resource import PresenceResource
+from synapse.api.errors import SynapseError

 from twisted.web.resource import Resource
 from twisted.web.server import NOT_DONE_YET
@@ -166,7 +167,8 @@ class ReplicationResource(Resource):
         def replicate():
             return self.replicate(request_streams, limit)

-        result = yield self.notifier.wait_for_replication(replicate, timeout)
+        writer = yield self.notifier.wait_for_replication(replicate, timeout)
+        result = writer.finish()

         for stream_name, stream_content in result.items():
             logger.info(
@@ -186,6 +188,9 @@ class ReplicationResource(Resource):
         current_token = yield self.current_replication_token()
         logger.debug("Replicating up to %r", current_token)

+        if limit == 0:
+            raise SynapseError(400, "Limit cannot be 0")
+
         yield self.account_data(writer, current_token, limit, request_streams)
         yield self.events(writer, current_token, limit, request_streams)
         # TODO: implement limit
@@ -200,7 +205,7 @@ class ReplicationResource(Resource):
         self.streams(writer, current_token, request_streams)

         logger.debug("Replicated %d rows", writer.total)
-        defer.returnValue(writer.finish())
+        defer.returnValue(writer)

     def streams(self, writer, current_token, request_streams):
         request_token = request_streams.get("streams")
@@ -233,31 +238,52 @@ class ReplicationResource(Resource):
         request_backfill = request_streams.get("backfill")

         if request_events is not None or request_backfill is not None:
-            if request_events is None:
+            if request_backfill is None:
                 request_events = current_token.events
             if request_backfill is None:
                 request_backfill = current_token.backfill
+
+            no_new_tokens = (
+                request_events == current_token.events
+                and request_backfill == current_token.backfill
+            )
+            if no_new_tokens:
+                return
+
             res = yield self.store.get_all_new_events(
                 request_backfill, request_events,
                 current_token.backfill, current_token.events,
                 limit
             )
-            writer.write_header_and_rows("events", res.new_forward_events, (
-                "position", "internal", "json", "state_group"
-            ))
-            writer.write_header_and_rows("backfill", res.new_backfill_events, (
-                "position", "internal", "json", "state_group"
-            ))
+
+            upto_events_token = _position_from_rows(
+                res.new_forward_events, current_token.events
+            )
+            upto_backfill_token = _position_from_rows(
+                res.new_backfill_events, current_token.backfill
+            )
+
+            if request_events != upto_events_token:
+                writer.write_header_and_rows("events", res.new_forward_events, (
+                    "position", "internal", "json", "state_group"
+                ), position=upto_events_token)
+
+            if request_backfill != upto_backfill_token:
+                writer.write_header_and_rows("backfill", res.new_backfill_events, (
+                    "position", "internal", "json", "state_group",
+                ), position=upto_backfill_token)
+
             writer.write_header_and_rows(
                 "forward_ex_outliers", res.forward_ex_outliers,
-                ("position", "event_id", "state_group")
+                ("position", "event_id", "state_group"),
             )
             writer.write_header_and_rows(
                 "backward_ex_outliers", res.backward_ex_outliers,
-                ("position", "event_id", "state_group")
+                ("position", "event_id", "state_group"),
             )
             writer.write_header_and_rows(
-                "state_resets", res.state_resets, ("position",)
+                "state_resets", res.state_resets, ("position",),
             )

     @defer.inlineCallbacks
@@ -266,15 +292,16 @@ class ReplicationResource(Resource):
         request_presence = request_streams.get("presence")

-        if request_presence is not None:
+        if request_presence is not None and request_presence != current_position:
             presence_rows = yield self.presence_handler.get_all_presence_updates(
                 request_presence, current_position
             )
+            upto_token = _position_from_rows(presence_rows, current_position)
             writer.write_header_and_rows("presence", presence_rows, (
                 "position", "user_id", "state", "last_active_ts",
                 "last_federation_update_ts", "last_user_sync_ts",
                 "status_msg", "currently_active",
-            ))
+            ), position=upto_token)

     @defer.inlineCallbacks
     def typing(self, writer, current_token, request_streams):
@@ -282,7 +309,7 @@ class ReplicationResource(Resource):
         request_typing = request_streams.get("typing")

-        if request_typing is not None:
+        if request_typing is not None and request_typing != current_position:
             # If they have a higher token than current max, we can assume that
             # they had been talking to a previous instance of the master. Since
             # we reset the token on restart, the best (but hacky) thing we can
@@ -293,9 +320,10 @@ class ReplicationResource(Resource):
             typing_rows = yield self.typing_handler.get_all_typing_updates(
                 request_typing, current_position
             )
+            upto_token = _position_from_rows(typing_rows, current_position)
             writer.write_header_and_rows("typing", typing_rows, (
                 "position", "room_id", "typing"
-            ))
+            ), position=upto_token)

     @defer.inlineCallbacks
     def receipts(self, writer, current_token, limit, request_streams):
@@ -303,13 +331,14 @@ class ReplicationResource(Resource):
         request_receipts = request_streams.get("receipts")

-        if request_receipts is not None:
+        if request_receipts is not None and request_receipts != current_position:
             receipts_rows = yield self.store.get_all_updated_receipts(
                 request_receipts, current_position, limit
             )
+            upto_token = _position_from_rows(receipts_rows, current_position)
             writer.write_header_and_rows("receipts", receipts_rows, (
                 "position", "room_id", "receipt_type", "user_id", "event_id", "data"
-            ))
+            ), position=upto_token)

     @defer.inlineCallbacks
     def account_data(self, writer, current_token, limit, request_streams):
@@ -324,23 +353,36 @@ class ReplicationResource(Resource):
                 user_account_data = current_position
             if room_account_data is None:
                 room_account_data = current_position
+
+            no_new_tokens = (
+                user_account_data == current_position
+                and room_account_data == current_position
+            )
+            if no_new_tokens:
+                return
+
             user_rows, room_rows = yield self.store.get_all_updated_account_data(
                 user_account_data, room_account_data, current_position, limit
             )
+
+            upto_users_token = _position_from_rows(user_rows, current_position)
+            upto_rooms_token = _position_from_rows(room_rows, current_position)
+
             writer.write_header_and_rows("user_account_data", user_rows, (
                 "position", "user_id", "type", "content"
-            ))
+            ), position=upto_users_token)
             writer.write_header_and_rows("room_account_data", room_rows, (
                 "position", "user_id", "room_id", "type", "content"
-            ))
+            ), position=upto_rooms_token)

         if tag_account_data is not None:
             tag_rows = yield self.store.get_all_updated_tags(
                 tag_account_data, current_position, limit
             )
+            upto_tag_token = _position_from_rows(tag_rows, current_position)
             writer.write_header_and_rows("tag_account_data", tag_rows, (
                 "position", "user_id", "room_id", "tags"
-            ))
+            ), position=upto_tag_token)

     @defer.inlineCallbacks
     def push_rules(self, writer, current_token, limit, request_streams):
@@ -348,14 +390,15 @@ class ReplicationResource(Resource):
         push_rules = request_streams.get("push_rules")

-        if push_rules is not None:
+        if push_rules is not None and push_rules != current_position:
             rows = yield self.store.get_all_push_rule_updates(
                 push_rules, current_position, limit
             )
+            upto_token = _position_from_rows(rows, current_position)
             writer.write_header_and_rows("push_rules", rows, (
                 "position", "event_stream_ordering", "user_id", "rule_id", "op",
                 "priority_class", "priority", "conditions", "actions"
-            ))
+            ), position=upto_token)

     @defer.inlineCallbacks
     def pushers(self, writer, current_token, limit, request_streams):
@@ -363,18 +406,19 @@ class ReplicationResource(Resource):
         pushers = request_streams.get("pushers")

-        if pushers is not None:
+        if pushers is not None and pushers != current_position:
             updated, deleted = yield self.store.get_all_updated_pushers(
                 pushers, current_position, limit
             )
+            upto_token = _position_from_rows(updated, current_position)
             writer.write_header_and_rows("pushers", updated, (
                 "position", "user_id", "access_token", "profile_tag", "kind",
                 "app_id", "app_display_name", "device_display_name", "pushkey",
                 "ts", "lang", "data"
-            ))
+            ), position=upto_token)
             writer.write_header_and_rows("deleted_pushers", deleted, (
                 "position", "user_id", "app_id", "pushkey"
-            ))
+            ), position=upto_token)

     @defer.inlineCallbacks
     def caches(self, writer, current_token, limit, request_streams):
@@ -382,13 +426,14 @@ class ReplicationResource(Resource):
         caches = request_streams.get("caches")

-        if caches is not None:
+        if caches is not None and caches != current_position:
             updated_caches = yield self.store.get_all_updated_caches(
                 caches, current_position, limit
             )
+            upto_token = _position_from_rows(updated_caches, current_position)
             writer.write_header_and_rows("caches", updated_caches, (
                 "position", "cache_func", "keys", "invalidation_ts"
-            ))
+            ), position=upto_token)

     @defer.inlineCallbacks
     def to_device(self, writer, current_token, limit, request_streams):
@@ -396,13 +441,14 @@ class ReplicationResource(Resource):
         to_device = request_streams.get("to_device")

-        if to_device is not None:
+        if to_device is not None and to_device != current_position:
             to_device_rows = yield self.store.get_all_new_device_messages(
                 to_device, current_position, limit
             )
+            upto_token = _position_from_rows(to_device_rows, current_position)
             writer.write_header_and_rows("to_device", to_device_rows, (
                 "position", "user_id", "device_id", "message_json"
-            ))
+            ), position=upto_token)

     @defer.inlineCallbacks
     def public_rooms(self, writer, current_token, limit, request_streams):
@@ -410,13 +456,14 @@ class ReplicationResource(Resource):
         public_rooms = request_streams.get("public_rooms")

-        if public_rooms is not None:
+        if public_rooms is not None and public_rooms != current_position:
             public_rooms_rows = yield self.store.get_all_new_public_rooms(
                 public_rooms, current_position, limit
             )
+            upto_token = _position_from_rows(public_rooms_rows, current_position)
             writer.write_header_and_rows("public_rooms", public_rooms_rows, (
                 "position", "room_id", "visibility"
-            ))
+            ), position=upto_token)


 class _Writer(object):
@@ -426,11 +473,11 @@ class _Writer(object):
         self.total = 0

     def write_header_and_rows(self, name, rows, fields, position=None):
-        if not rows:
-            return
-
         if position is None:
-            position = rows[-1][0]
+            if rows:
+                position = rows[-1][0]
+            else:
+                return

         self.streams[name] = {
             "position": position if type(position) is int else str(position),
@@ -440,6 +487,9 @@ class _Writer(object):
         self.total += len(rows)

+    def __nonzero__(self):
+        return bool(self.total)
+
     def finish(self):
         return self.streams

@@ -461,3 +511,20 @@ class _ReplicationToken(collections.namedtuple("_ReplicationToken", (
     def __str__(self):
         return "_".join(str(value) for value in self)
+
+
+def _position_from_rows(rows, current_position):
+    """Calculates a position to return for a stream. Ideally we want to
+    return the position of the last row, as that will be the most correct.
+    However, if there are no rows we fall back to using the current position
+    to stop us from repeatedly hitting the storage layer unnecessarily,
+    thinking there are updates. (Not all advances of the token correspond
+    to an actual update.)
+
+    We can't just always return the current position, as we often limit the
+    number of rows we replicate, and so the stream may lag. The assumption is
+    that if the storage layer returns no new rows then we are not lagging and
+    we are at the `current_position`.
+    """
+    if rows:
+        return rows[-1][0]
+    return current_position
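For illustration, the two cases the helper distinguishes, assuming
_position_from_rows as defined above and rows whose first element is the
stream position (the values here are made up):

rows = [(10, "a"), (12, "b")]
assert _position_from_rows(rows, current_position=20) == 12  # position of last row
assert _position_from_rows([], current_position=20) == 20    # no rows: not lagging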

synapse/storage/room.py

@@ -320,6 +320,9 @@ class RoomStore(SQLBaseStore):
             txn.execute(sql, (prev_id, current_id, limit,))
             return txn.fetchall()

+        if prev_id == current_id:
+            return defer.succeed([])
+
         return self.runInteraction(
             "get_all_new_public_rooms", get_all_new_public_rooms
         )

tests/replication/slave/storage/_base.py

@@ -42,7 +42,8 @@ class BaseSlavedStoreTestCase(unittest.TestCase):
     @defer.inlineCallbacks
     def replicate(self):
         streams = self.slaved_store.stream_positions()
-        result = yield self.replication.replicate(streams, 100)
+        writer = yield self.replication.replicate(streams, 100)
+        result = writer.finish()
         yield self.slaved_store.process_replication(result)

     @defer.inlineCallbacks

tests/replication/test_resource.py

@@ -120,7 +120,7 @@ class ReplicationResourceCase(unittest.TestCase):
             self.hs.clock.advance_time_msec(1)
             code, body = yield get
             self.assertEquals(code, 200)
-            self.assertEquals(body, {})
+            self.assertEquals(body.get("rows", []), [])

         test_timeout.__name__ = "test_timeout_%s" % (stream)
         return test_timeout
@@ -195,7 +195,6 @@ class ReplicationResourceCase(unittest.TestCase):
             self.assertIn("field_names", stream)
             field_names = stream["field_names"]
             self.assertIn("rows", stream)
-            self.assertTrue(stream["rows"])
             for row in stream["rows"]:
                 self.assertEquals(
                     len(row), len(field_names),