matrix-public-archive/server/fetch-events-in-range.js

135 lines
4.4 KiB
JavaScript
Raw Normal View History

2022-02-24 02:27:53 -07:00
'use strict';
const assert = require('assert');
const urlJoin = require('url-join');
const { fetchEndpointAsJson } = require('./lib/fetch-endpoint');
const { traceFunction } = require('./tracing/trace-utilities');
2022-02-24 02:27:53 -07:00
const config = require('./lib/config');
const matrixServerUrl = config.get('matrixServerUrl');
assert(matrixServerUrl);
// Find an event right ahead of where we are trying to look. Then paginate
// /messages backwards. This makes sure that we can get events for the day
// when the room started.
//
// Consider this scenario: dayStart(fromTs) <---- msg1 <- msg2 <-- msg3 <---- dayEnd(toTs)
// - ❌ If we start from dayStart and look backwards, we will find nothing.
// - ❌ If we start from dayStart and look forwards, we will find msg1, but federated backfill won't be able to paginate forwards
// - ✅ If we start from dayEnd and look backwards, we will find msg3
2022-02-24 02:27:53 -07:00
// - ❌ If we start from dayEnd and look forwards, we will find nothing
//
// Returns events in reverse-chronological order.
async function fetchEventsFromTimestampBackwards(accessToken, roomId, ts, limit) {
assert(accessToken);
assert(roomId);
assert(ts);
assert(limit);
const timestampToEventEndpoint = urlJoin(
matrixServerUrl,
`_matrix/client/unstable/org.matrix.msc3030/rooms/${roomId}/timestamp_to_event?ts=${ts}&dir=b`
);
const timestampToEventResData = await fetchEndpointAsJson(timestampToEventEndpoint, {
accessToken,
});
const eventIdForTimestamp = timestampToEventResData.event_id;
assert(eventIdForTimestamp);
//console.log('eventIdForTimestamp', eventIdForTimestamp);
// We only use this endpoint to get a pagination token we can use with
// `/messages`.
//
// We add `limit=0` here because we want to grab the pagination token right
// (before/after) the event.
Fix archive not responding in big rooms because homeserver times out on `/context` request Add `filter={"lazy_load_members":true}` so that `/context` responds without timing out by returning just the state for the sender of the included event. Otherwise, the homeserver returns all state in the room at that point in time which in big rooms, can be 100k member events that we don't care about anyway. Synapse seems to timeout at about the ~5k state event mark. ## Dev notes Without `filter={"lazy_load_members":true}`, Synapse can only handle ~4k member events (probably state in general) before `/context` times out before it ever responds in the 180 second window. I'm only looking at the member count as a rough proxy but the number of member events + state will be more. Loads | Member count | Alias | Matrix.to | API request --- | --- | --- | --- | --- :x: | 39k | `#matrix:matrix.org` | [🔗 ](https://matrix.to/#/!OGEhHVWSdvArJzumhm:matrix.org/$xo-tESRNP1Vg1RxLXIfmMeO6dA6-u9XuE2lv6toeKcw?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!OGEhHVWSdvArJzumhm:matrix.org/context/$xo-tESRNP1Vg1RxLXIfmMeO6dA6-u9XuE2lv6toeKcw?limit=0` :x: | 31k | `#openwisp_general:gitter.im` | [🔗 ](https://matrix.to/#/!RBzfoBeqYcCwLAAenz:gitter.im/$YFjmbmH0NRPmfVsPyxhM0jcK4RFR_CdCigtHSgCTLSc?via=matrix.org) |`https://matrix-client.matrix.org/_matrix/client/r0/rooms/!RBzfoBeqYcCwLAAenz:gitter.im/context/$YFjmbmH0NRPmfVsPyxhM0jcK4RFR_CdCigtHSgCTLSc?limit=0` :x: | 16k | `#raspberrypi:matrix.org` | [🔗 ](https://matrix.to/#/!wOlkWNmgkAZFxbTaqj:matrix.org/$WMh_QauoyW6cjFDuUeNgsnUSgDn7C2XlrUnlijhdmdk?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!wOlkWNmgkAZFxbTaqj:matrix.org/context/$WMh_QauoyW6cjFDuUeNgsnUSgDn7C2XlrUnlijhdmdk?limit=0` Loads with warm cache | 7.2k | `#element-android:matrix.org` | [🔗 ](https://matrix.to/#/!AZozoWghOYSIAfaZjJ:matrix.org/$Qmb8vmaD91PIM6ROfh-2jApPoJgH2Q7NnjiwRdiEPZE?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!AZozoWghOYSIAfaZjJ:matrix.org/context/$Qmb8vmaD91PIM6ROfh-2jApPoJgH2Q7NnjiwRdiEPZE?limit=0` ✅ | 4k | `#nim-science:envs.net` | [🔗 ](https://matrix.to/#/!IpFtPSbgfrZrVcVyti:envs.net/$bDRop4yvjFOl3HMouAyx4mtau-JaAxvBUJ-MqftAm7E?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!IpFtPSbgfrZrVcVyti:envs.net/context/$bDRop4yvjFOl3HMouAyx4mtau-JaAxvBUJ-MqftAm7E?limit=0` ✅ | 2.3k | `#nim-webdev:matrix.org` | [🔗 ](https://matrix.to/#/!EoyccMaVGwdqyfKMAL:matrix.org/$VdRyNb1F-gLdDP9nDYK7xAh_LcE822fEfDYWz5re8dE?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!EoyccMaVGwdqyfKMAL:matrix.org/context/$VdRyNb1F-gLdDP9nDYK7xAh_LcE822fEfDYWz5re8dE?limit=0`
2022-06-29 04:37:40 -06:00
//
// Add `filter={"lazy_load_members":true}` so that this endpoint responds
// without timing out by returning just the state for the sender of the
// included event. Otherwise, the homeserver returns all state in the room at
// that point in time which in big rooms, can be 100k member events that we
Fix archive not responding in big rooms because homeserver times out on `/context` request Add `filter={"lazy_load_members":true}` so that `/context` responds without timing out by returning just the state for the sender of the included event. Otherwise, the homeserver returns all state in the room at that point in time which in big rooms, can be 100k member events that we don't care about anyway. Synapse seems to timeout at about the ~5k state event mark. ## Dev notes Without `filter={"lazy_load_members":true}`, Synapse can only handle ~4k member events (probably state in general) before `/context` times out before it ever responds in the 180 second window. I'm only looking at the member count as a rough proxy but the number of member events + state will be more. Loads | Member count | Alias | Matrix.to | API request --- | --- | --- | --- | --- :x: | 39k | `#matrix:matrix.org` | [🔗 ](https://matrix.to/#/!OGEhHVWSdvArJzumhm:matrix.org/$xo-tESRNP1Vg1RxLXIfmMeO6dA6-u9XuE2lv6toeKcw?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!OGEhHVWSdvArJzumhm:matrix.org/context/$xo-tESRNP1Vg1RxLXIfmMeO6dA6-u9XuE2lv6toeKcw?limit=0` :x: | 31k | `#openwisp_general:gitter.im` | [🔗 ](https://matrix.to/#/!RBzfoBeqYcCwLAAenz:gitter.im/$YFjmbmH0NRPmfVsPyxhM0jcK4RFR_CdCigtHSgCTLSc?via=matrix.org) |`https://matrix-client.matrix.org/_matrix/client/r0/rooms/!RBzfoBeqYcCwLAAenz:gitter.im/context/$YFjmbmH0NRPmfVsPyxhM0jcK4RFR_CdCigtHSgCTLSc?limit=0` :x: | 16k | `#raspberrypi:matrix.org` | [🔗 ](https://matrix.to/#/!wOlkWNmgkAZFxbTaqj:matrix.org/$WMh_QauoyW6cjFDuUeNgsnUSgDn7C2XlrUnlijhdmdk?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!wOlkWNmgkAZFxbTaqj:matrix.org/context/$WMh_QauoyW6cjFDuUeNgsnUSgDn7C2XlrUnlijhdmdk?limit=0` Loads with warm cache | 7.2k | `#element-android:matrix.org` | [🔗 ](https://matrix.to/#/!AZozoWghOYSIAfaZjJ:matrix.org/$Qmb8vmaD91PIM6ROfh-2jApPoJgH2Q7NnjiwRdiEPZE?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!AZozoWghOYSIAfaZjJ:matrix.org/context/$Qmb8vmaD91PIM6ROfh-2jApPoJgH2Q7NnjiwRdiEPZE?limit=0` ✅ | 4k | `#nim-science:envs.net` | [🔗 ](https://matrix.to/#/!IpFtPSbgfrZrVcVyti:envs.net/$bDRop4yvjFOl3HMouAyx4mtau-JaAxvBUJ-MqftAm7E?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!IpFtPSbgfrZrVcVyti:envs.net/context/$bDRop4yvjFOl3HMouAyx4mtau-JaAxvBUJ-MqftAm7E?limit=0` ✅ | 2.3k | `#nim-webdev:matrix.org` | [🔗 ](https://matrix.to/#/!EoyccMaVGwdqyfKMAL:matrix.org/$VdRyNb1F-gLdDP9nDYK7xAh_LcE822fEfDYWz5re8dE?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!EoyccMaVGwdqyfKMAL:matrix.org/context/$VdRyNb1F-gLdDP9nDYK7xAh_LcE822fEfDYWz5re8dE?limit=0`
2022-06-29 04:37:40 -06:00
// don't care about anyway. Synapse seems to timeout at about the ~5k state
// event mark.
2022-02-24 02:27:53 -07:00
const contextEndpoint = urlJoin(
matrixServerUrl,
Fix archive not responding in big rooms because homeserver times out on `/context` request Add `filter={"lazy_load_members":true}` so that `/context` responds without timing out by returning just the state for the sender of the included event. Otherwise, the homeserver returns all state in the room at that point in time which in big rooms, can be 100k member events that we don't care about anyway. Synapse seems to timeout at about the ~5k state event mark. ## Dev notes Without `filter={"lazy_load_members":true}`, Synapse can only handle ~4k member events (probably state in general) before `/context` times out before it ever responds in the 180 second window. I'm only looking at the member count as a rough proxy but the number of member events + state will be more. Loads | Member count | Alias | Matrix.to | API request --- | --- | --- | --- | --- :x: | 39k | `#matrix:matrix.org` | [🔗 ](https://matrix.to/#/!OGEhHVWSdvArJzumhm:matrix.org/$xo-tESRNP1Vg1RxLXIfmMeO6dA6-u9XuE2lv6toeKcw?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!OGEhHVWSdvArJzumhm:matrix.org/context/$xo-tESRNP1Vg1RxLXIfmMeO6dA6-u9XuE2lv6toeKcw?limit=0` :x: | 31k | `#openwisp_general:gitter.im` | [🔗 ](https://matrix.to/#/!RBzfoBeqYcCwLAAenz:gitter.im/$YFjmbmH0NRPmfVsPyxhM0jcK4RFR_CdCigtHSgCTLSc?via=matrix.org) |`https://matrix-client.matrix.org/_matrix/client/r0/rooms/!RBzfoBeqYcCwLAAenz:gitter.im/context/$YFjmbmH0NRPmfVsPyxhM0jcK4RFR_CdCigtHSgCTLSc?limit=0` :x: | 16k | `#raspberrypi:matrix.org` | [🔗 ](https://matrix.to/#/!wOlkWNmgkAZFxbTaqj:matrix.org/$WMh_QauoyW6cjFDuUeNgsnUSgDn7C2XlrUnlijhdmdk?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!wOlkWNmgkAZFxbTaqj:matrix.org/context/$WMh_QauoyW6cjFDuUeNgsnUSgDn7C2XlrUnlijhdmdk?limit=0` Loads with warm cache | 7.2k | `#element-android:matrix.org` | [🔗 ](https://matrix.to/#/!AZozoWghOYSIAfaZjJ:matrix.org/$Qmb8vmaD91PIM6ROfh-2jApPoJgH2Q7NnjiwRdiEPZE?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!AZozoWghOYSIAfaZjJ:matrix.org/context/$Qmb8vmaD91PIM6ROfh-2jApPoJgH2Q7NnjiwRdiEPZE?limit=0` ✅ | 4k | `#nim-science:envs.net` | [🔗 ](https://matrix.to/#/!IpFtPSbgfrZrVcVyti:envs.net/$bDRop4yvjFOl3HMouAyx4mtau-JaAxvBUJ-MqftAm7E?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!IpFtPSbgfrZrVcVyti:envs.net/context/$bDRop4yvjFOl3HMouAyx4mtau-JaAxvBUJ-MqftAm7E?limit=0` ✅ | 2.3k | `#nim-webdev:matrix.org` | [🔗 ](https://matrix.to/#/!EoyccMaVGwdqyfKMAL:matrix.org/$VdRyNb1F-gLdDP9nDYK7xAh_LcE822fEfDYWz5re8dE?via=matrix.org) | `https://matrix-client.matrix.org/_matrix/client/r0/rooms/!EoyccMaVGwdqyfKMAL:matrix.org/context/$VdRyNb1F-gLdDP9nDYK7xAh_LcE822fEfDYWz5re8dE?limit=0`
2022-06-29 04:37:40 -06:00
`_matrix/client/r0/rooms/${roomId}/context/${eventIdForTimestamp}?limit=0&filter={"lazy_load_members":true}`
2022-02-24 02:27:53 -07:00
);
const contextResData = await fetchEndpointAsJson(contextEndpoint, {
accessToken,
});
//console.log('contextResData', contextResData);
// Add `filter={"lazy_load_members":true}` to only get member state events for
// the messages included in the response
2022-02-24 02:27:53 -07:00
const messagesEndpoint = urlJoin(
matrixServerUrl,
`_matrix/client/r0/rooms/${roomId}/messages?dir=b&from=${contextResData.end}&limit=${limit}&filter={"lazy_load_members":true}`
2022-02-24 02:27:53 -07:00
);
const messageResData = await fetchEndpointAsJson(messagesEndpoint, {
accessToken,
});
//console.log('messageResData.state', messageResData.state);
const stateEventMap = {};
for (const stateEvent of messageResData.state || []) {
if (stateEvent.type === 'm.room.member') {
stateEventMap[stateEvent.state_key] = stateEvent;
}
}
return {
stateEventMap,
events: messageResData.chunk,
};
}
async function fetchEventsInRange(accessToken, roomId, startTs, endTs, limit) {
assert(accessToken);
assert(roomId);
assert(startTs);
assert(endTs);
assert(limit);
2022-02-24 12:06:19 -07:00
//console.log('fetchEventsInRange', startTs, endTs);
2022-02-24 02:27:53 -07:00
// Fetch events from endTs and before
const { events, stateEventMap } = await fetchEventsFromTimestampBackwards(
accessToken,
roomId,
endTs,
limit
);
2022-02-24 12:06:19 -07:00
//console.log('events', events.length);
2022-02-24 02:27:53 -07:00
let eventsInRange = events;
// `events` are in reverse-chronological order.
// We only need to filter if the oldest message is before startTs
if (events[events.length - 1].origin_server_ts < startTs) {
eventsInRange = [];
// Let's iterate until we see events before startTs
for (let i = 0; i < events.length; i++) {
const event = events[i];
// Once we found an event before startTs, the rest are outside of our range
if (event.origin_server_ts < startTs) {
break;
}
eventsInRange.push(event);
}
}
2022-02-24 12:06:19 -07:00
//console.log('eventsInRange', eventsInRange.length);
2022-02-24 02:27:53 -07:00
const chronologicalEventsInRange = eventsInRange.reverse();
return {
stateEventMap,
events: chronologicalEventsInRange,
};
}
module.exports = traceFunction(fetchEventsInRange);