// matrix-public-archive/server/routes/room-routes.js
//
// 970 lines
// 43 KiB
// JavaScript

'use strict';
const assert = require('assert');
const path = require('path');
const urlJoin = require('url-join');
const express = require('express');
const asyncHandler = require('../lib/express-async-handler');
const StatusError = require('../lib/errors/status-error');
const redirectToCorrectArchiveUrlIfBadSigil = require('../middleware/redirect-to-correct-archive-url-if-bad-sigil-middleware');
const identifyRoute = require('../middleware/identify-route-middleware');
const { HTTPResponseError } = require('../lib/fetch-endpoint');
const parseViaServersFromUserInput = require('../lib/parse-via-servers-from-user-input');
const {
fetchRoomData,
fetchPredecessorInfo,
fetchSuccessorInfo,
} = require('../lib/matrix-utils/fetch-room-data');
const fetchEventsFromTimestampBackwards = require('../lib/matrix-utils/fetch-events-from-timestamp-backwards');
const ensureRoomJoined = require('../lib/matrix-utils/ensure-room-joined');
const timestampToEvent = require('../lib/matrix-utils/timestamp-to-event');
const { removeMe_fetchRoomCreateEventId } = require('../lib/matrix-utils/fetch-room-data');
const getMessagesResponseFromEventId = require('../lib/matrix-utils/get-messages-response-from-event-id');
const renderHydrogenVmRenderScriptToPageHtml = require('../hydrogen-render/render-hydrogen-vm-render-script-to-page-html');
const setHeadersToPreloadAssets = require('../lib/set-headers-to-preload-assets');
const setHeadersForDateTemporalContext = require('../lib/set-headers-for-date-temporal-context');
const MatrixPublicArchiveURLCreator = require('matrix-public-archive-shared/lib/url-creator');
const { mxcUrlToHttpThumbnail } = require('matrix-public-archive-shared/lib/mxc-url-to-http');
const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw');
const {
MS_LOOKUP,
TIME_PRECISION_VALUES,
DIRECTION,
VALID_ENTITY_DESCRIPTOR_TO_SIGIL_MAP,
} = require('matrix-public-archive-shared/lib/reference-values');
const { ONE_DAY_IN_MS, ONE_HOUR_IN_MS, ONE_MINUTE_IN_MS, ONE_SECOND_IN_MS } = MS_LOOKUP;
const {
roundUpTimestampToUtcDay,
roundUpTimestampToUtcHour,
roundUpTimestampToUtcMinute,
roundUpTimestampToUtcSecond,
getUtcStartOfDayTs,
getUtcStartOfHourTs,
getUtcStartOfMinuteTs,
getUtcStartOfSecondTs,
areTimestampsFromSameUtcDay,
areTimestampsFromSameUtcHour,
areTimestampsFromSameUtcMinute,
areTimestampsFromSameUtcSecond,
} = require('matrix-public-archive-shared/lib/timestamp-utilities');
// Module-level configuration. `basePath`, `matrixServerUrl` and `matrixAccessToken`
// are asserted right after being read, so requiring this module throws if any of them
// is missing or otherwise falsy.
const config = require('../lib/config');
const basePath = config.get('basePath');
assert(basePath);
const matrixServerUrl = config.get('matrixServerUrl');
assert(matrixServerUrl);
const matrixAccessToken = config.get('matrixAccessToken');
assert(matrixAccessToken);
// Not asserted, so this one is allowed to be unset/falsy.
const stopSearchEngineIndexing = config.get('stopSearchEngineIndexing');
// URL builder shared by the routes in this file, rooted at the configured `basePath`.
const matrixPublicArchiveURLCreator = new MatrixPublicArchiveURLCreator(basePath);
const router = express.Router({
// Route paths on this router are matched case-sensitively.
caseSensitive: true,
// Preserve the req.params values from the parent router.
mergeParams: true,
});
// Matches a single valid sigil at the very start of a string (used to strip a leading
// sigil from user-supplied room IDs/aliases).
//
// NOTE(review): the sigils are interpolated into the pattern unescaped, which assumes
// none of them is a regex metacharacter - confirm against
// `VALID_ENTITY_DESCRIPTOR_TO_SIGIL_MAP` if new sigils are ever added.
const validSigilList = Object.values(VALID_ENTITY_DESCRIPTOR_TO_SIGIL_MAP);
const sigilRe = new RegExp(`^(${validSigilList.join('|')})`);
/**
 * Build the user-facing explanation shown when a single second contains more messages
 * than one archive page can display.
 *
 * @param {number} archiveMessageLimit - Page size limit that gets quoted in the message.
 * @returns {string} The complete error sentence.
 */
function getErrorStringForTooManyMessages(archiveMessageLimit) {
  const sentenceParts = [
    `Too many messages were sent all within a second for us to display `,
    `(more than ${archiveMessageLimit} in one second). We're unable to redirect you to `,
    `a smaller time range to view them without losing a few between each page. `,
    `Since this is probably pretty rare, we've decided not to support it for now.`,
  ];
  return sentenceParts.join('');
}
/**
 * Rebuild the full room ID or alias (sigil included) from the route params.
 *
 * Reads `req.params.entityDescriptor` to decide which sigil to use and
 * `req.params.roomIdOrAliasDirty` for the rest of the identifier.
 *
 * @param {object} req - Express request carrying the two route params above.
 * @returns {string} The sigil for the entity descriptor followed by the identifier
 *   with any leading valid sigil stripped.
 * @throws {Error} When the entity descriptor has no sigil mapped to it.
 */
function getRoomIdOrAliasFromReq(req) {
  const { entityDescriptor, roomIdOrAliasDirty } = req.params;

  // The URL is supposed to carry the identifier without a sigil, but people may
  // include one anyway, so drop a leading sigil if present before re-adding the
  // correct one.
  const bareRoomIdOrAlias = roomIdOrAliasDirty.replace(sigilRe, '');

  const expectedSigil = VALID_ENTITY_DESCRIPTOR_TO_SIGIL_MAP[entityDescriptor];
  if (expectedSigil) {
    return `${expectedSigil}${bareRoomIdOrAlias}`;
  }

  throw new Error(
    `Unknown entityDescriptor=${entityDescriptor} has no sigil. This is an error with the Matrix Public Archive itself (please open an issue).`
  );
}
/**
 * Parse the date (and optional time) route params into the timestamp that the archive
 * page should display messages backwards from.
 *
 * Reads `req.params.yyyy`, `req.params.mm`, `req.params.dd` and the optional
 * `req.params.time`, which must look like `T<hour>:<minute>` or
 * `T<hour>:<minute>:<second>` in 24-hour format. Everything is interpreted as UTC.
 *
 * - With a time: `toTimestamp` is that exact moment. This includes `T00:00`, which
 *   resolves to the very start of the day.
 * - Without a time: `toTimestamp` is the last millisecond of the given day.
 *
 * @param {object} req - Express request carrying the route params above.
 * @returns {{toTimestamp: number, yyyy: number, mm: number, dd: number,
 *   timeDefined: boolean, secondsDefined: boolean}} Note that `mm` is zero-based
 *   (January is `0`) so it can be passed straight to `Date.UTC`.
 * @throws {StatusError} 404 when the time can't be parsed or a component is out of range.
 */
// eslint-disable-next-line max-statements, complexity
function parseArchiveRangeFromReq(req) {
  const yyyy = parseInt(req.params.yyyy, 10);
  // Month is the only zero-based index in this group
  const mm = parseInt(req.params.mm, 10) - 1;
  const dd = parseInt(req.params.dd, 10);

  const timeString = req.params.time;
  let timeInMs = 0;
  let timeDefined = false;
  let secondsDefined = false;
  if (timeString) {
    const timeMatches = timeString.match(/^T(\d\d?):(\d\d?)(?::(\d\d?))?$/);
    if (!timeMatches) {
      throw new StatusError(
        404,
        'Time was unable to be parsed from URL. It should be in 24-hour format 23:59:59'
      );
    }

    // Capture groups 1 and 2 are mandatory in the pattern so they are always present
    // here; only the seconds group is optional.
    const hour = parseInt(timeMatches[1], 10);
    const minute = parseInt(timeMatches[2], 10);
    const second = timeMatches[3] ? parseInt(timeMatches[3], 10) : 0;

    timeDefined = true;
    // Whether the timestamp included seconds
    secondsDefined = !!timeMatches[3];

    if (Number.isNaN(hour) || hour < 0 || hour > 23) {
      throw new StatusError(404, `Hour can only be in range 0-23 -> ${hour}`);
    }
    if (Number.isNaN(minute) || minute < 0 || minute > 59) {
      throw new StatusError(404, `Minute can only be in range 0-59 -> ${minute}`);
    }
    if (Number.isNaN(second) || second < 0 || second > 59) {
      throw new StatusError(404, `Second can only be in range 0-59 -> ${second}`);
    }

    const hourInMs = hour * ONE_HOUR_IN_MS;
    const minuteInMs = minute * ONE_MINUTE_IN_MS;
    const secondInMs = second * ONE_SECOND_IN_MS;
    timeInMs = hourInMs + minuteInMs + secondInMs;
  }

  let toTimestamp;
  // Branch on whether a time was given, not on `timeInMs` being truthy: midnight
  // (`T00:00`) is 0ms into the day and must not be mistaken for "no time specified".
  if (timeDefined) {
    const startOfDayTimestamp = Date.UTC(yyyy, mm, dd);
    toTimestamp = startOfDayTimestamp + timeInMs;
  }
  // If no time specified, then we assume end-of-day
  else {
    // We `- 1` from UTC midnight of the next day to get the timestamp that is a
    // millisecond before the next day (T23:59:59.999 of the requested day)
    toTimestamp = Date.UTC(yyyy, mm, dd + 1) - 1;
  }

  return {
    toTimestamp,
    yyyy,
    mm,
    dd,
    // Whether the req included time `T23:59`
    timeDefined,
    // Whether the req included seconds in the time `T23:59:59`
    secondsDefined,
  };
}
// Mounted without a path and ahead of every route declared on this router, so it runs
// before the handlers registered after it.
//
// NOTE(review): going by the name, this presumably redirects requests whose room
// ID/alias carries the wrong sigil to the corrected archive URL - the implementation is
// not visible from this file, confirm in the middleware module.
router.use(redirectToCorrectArchiveUrlIfBadSigil);
// Room index: joins the room if necessary, looks up the most recent event at or before
// the moment the request arrived, and redirects to the archive page for that event's
// date.
router.get(
  '/',
  identifyRoute('app-archive-room-index'),
  asyncHandler(async (req, res) => {
    const roomIdOrAlias = getRoomIdOrAliasFromReq(req);

    // Capture "now" before joining. If this is our first time in the room, searching
    // backwards from a pre-join timestamp avoids landing on our own join event.
    const tsBeforeJoin = Date.now();

    // Nothing else (room info, messages) can be fetched until the join has completed.
    const viaServers = parseViaServersFromUserInput(req.query.via);
    const roomId = await ensureRoomJoined(matrixAccessToken, roomIdOrAlias, {
      viaServers,
      abortSignal: req.abortSignal,
    });

    // Locate the closest event looking backwards from the captured time
    const closestEvent = await timestampToEvent({
      accessToken: matrixAccessToken,
      roomId,
      ts: tsBeforeJoin,
      direction: DIRECTION.backward,
      abortSignal: req.abortSignal,
    });
    const { originServerTs } = closestEvent;
    if (!originServerTs) {
      throw new StatusError(404, 'Unable to find day with history');
    }

    // Send the visitor to the day that actually has messages. The `via` query
    // parameter is intentionally not forwarded: the room was already joined above (see
    // `ensureRoomJoined`), so it is no longer needed.
    const dayWithMessagesUrl = matrixPublicArchiveURLCreator.archiveUrlForDate(
      roomIdOrAlias,
      new Date(originServerTs),
      {}
    );
    res.redirect(dayWithMessagesUrl);
  })
);
// Placeholder permalink route for a single event. It currently only answers with the
// literal body `todo`.
router.get(
  '/event/:eventId',
  identifyRoute('app-archive-room-event'),
  asyncHandler(async (req, res) => {
    // TODO: Look the event up to read its `origin_server_ts`, then redirect to the
    // matching day with the event highlighted, i.e. /!roomId/2022/01/01?at=$eventId
    res.send('todo');
  })
);
// Works out which archive URL continues the timeline in the requested direction and
// redirects there.
//
// Query parameters:
//  - `currentRangeStartTs` / `currentRangeEndTs`: timestamps (ms) bounding what the
//    visitor is currently looking at. Both must parse as integers.
//  - `dir`: `DIRECTION.forward` or `DIRECTION.backward`.
//  - `timelineStartEventId` / `timelineEndEventId`: optional event IDs at the edges of
//    the currently displayed timeline (each must be a string when present).
//  - `via`: optional servers to join the room through.
//
// Outcomes:
//  - Redirect to a date URL in this room that contains the next chunk of messages.
//  - When there are no more messages (404 from the lookups): redirect into the
//    predecessor room (backward) or the successor room (forward) if one exists,
//    otherwise redirect one day further in the requested direction.
//  - 501 when jumping forward and everything fetched falls within a single second.
router.get(
  '/jump',
  identifyRoute('app-archive-room-jump'),
  // eslint-disable-next-line max-statements, complexity
  asyncHandler(async function (req, res) {
    const roomIdOrAlias = getRoomIdOrAliasFromReq(req);

    const currentRangeStartTs = parseInt(req.query.currentRangeStartTs, 10);
    assert(
      !Number.isNaN(currentRangeStartTs),
      '?currentRangeStartTs query parameter must be a number'
    );
    const currentRangeEndTs = parseInt(req.query.currentRangeEndTs, 10);
    assert(!Number.isNaN(currentRangeEndTs), '?currentRangeEndTs query parameter must be a number');
    const dir = req.query.dir;
    assert(
      [DIRECTION.forward, DIRECTION.backward].includes(dir),
      '?dir query parameter must be [f|b]'
    );
    const timelineStartEventId = req.query.timelineStartEventId;
    assert(
      ['string', 'undefined'].includes(typeof timelineStartEventId),
      `?timelineStartEventId must be a string or undefined but saw ${typeof timelineStartEventId}`
    );
    const timelineEndEventId = req.query.timelineEndEventId;
    // Validate `timelineEndEventId` itself (a repeated query parameter would arrive as
    // an array, for example) and report its own type in the message.
    assert(
      ['string', 'undefined'].includes(typeof timelineEndEventId),
      `?timelineEndEventId must be a string or undefined but saw ${typeof timelineEndEventId}`
    );

    // We have to wait for the room join to happen first before we can use the jump to
    // date endpoint (or any other Matrix endpoint)
    const viaServers = parseViaServersFromUserInput(req.query.via);
    const roomId = await ensureRoomJoined(matrixAccessToken, roomIdOrAlias, {
      viaServers,
      abortSignal: req.abortSignal,
    });

    let ts;
    let fromCausalEventId;
    if (dir === DIRECTION.backward) {
      // We `- 1` so we don't jump to the same event because the endpoint is inclusive.
      //
      // XXX: This is probably an edge-case flaw when there could be multiple events at
      // the same timestamp
      //
      // TODO: Remove the `- 1` when we have the MSC3999 causal event ID support
      ts = currentRangeStartTs - 1;
      fromCausalEventId = timelineStartEventId;
    } else if (dir === DIRECTION.forward) {
      // We `+ 1` so we don't jump to the same event because the endpoint is inclusive
      //
      // XXX: This is probably an edge-case flaw when there could be multiple events at
      // the same timestamp
      //
      // TODO: Remove the `+ 1` when we have the MSC3999 causal event ID support
      ts = currentRangeEndTs + 1;
      fromCausalEventId = timelineEndEventId;
    } else {
      throw new StatusError(400, `Unable to handle unknown dir=${dir} in /jump`);
    }

    let eventIdForClosestEvent;
    let tsForClosestEvent;
    let newOriginServerTs;
    let preferredPrecision = null;
    try {
      // We pull this fresh from the config for each request to ensure we have an
      // updated value between each e2e test
      const archiveMessageLimit = config.get('archiveMessageLimit');

      let roomCreateEventId;
      // Find the closest event to the given timestamp
      [{ eventId: eventIdForClosestEvent, originServerTs: tsForClosestEvent }, roomCreateEventId] =
        await Promise.all([
          timestampToEvent({
            accessToken: matrixAccessToken,
            roomId,
            ts: ts,
            direction: dir,
            // Since timestamps are untrusted and can be crafted to make loops in the
            // timeline. We use this as a signal to keep progressing from this event
            // regardless of what timestamp shenanigans are going on. See MSC3999
            // (https://github.com/matrix-org/matrix-spec-proposals/pull/3999)
            //
            // TODO: Add tests for timestamp loops once Synapse supports MSC3999. We
            // currently just have this set in case some server has this implemented in
            // the future but there currently is no implementation (as of 2023-04-17) and
            // we can't have passing tests without a server implementation first.
            //
            // TODO: This isn't implemented yet
            fromCausalEventId,
            abortSignal: req.abortSignal,
          }),
          removeMe_fetchRoomCreateEventId(matrixAccessToken, roomId, {
            abortSignal: req.abortSignal,
          }),
        ]);

      // Without MSC3999, we currently only detect one kind of loop where the
      // `m.room.create` has a timestamp that comes after the timestamp massaged events
      // in the room. This is a common pattern for historical Gitter rooms where we
      // created the room and then imported a bunch of messages at a time before the
      // room was created.
      //
      // By nature of having an `timelineEndEventId`, we know we are already paginated
      // past the `m.room.create` event which is always the first event in the room. So
      // we can use that to detect the end of the room before we loop back around to the
      // start of the room.
      //
      // XXX: Once we have MSC3999, we can remove this check in favor of that mechanism
      if (
        dir === DIRECTION.forward &&
        timelineEndEventId &&
        eventIdForClosestEvent === roomCreateEventId
      ) {
        throw new StatusError(
          404,
          `/jump?dir=${dir}: We detected a loop back to the beginning of the room so we can assume ` +
            `we hit the end of the room instead of doing a loop. We throw a 404 error here we hit ` +
            `the normal 404 no more /messages error handling below`
        );
      }

      // Based on what we found was the closest, figure out the URL that will represent
      // the next chunk in the desired direction.
      // ==============================
      //
      // When jumping backwards, since a given room archive URL represents the end of
      // the day/time-period looking backward (scroll is also anchored to the bottom),
      // we just need to move the user to the time-period just prior the current one.
      //
      // We are trying to avoid sending the user to the same time period they were just
      // viewing. i.e, if they were visiting `/2020/01/02T16:00:00` (displays messages
      // backwards from that time up to the limit), which had more messages than we
      // could display in that day, jumping backwards from the earliest displayed event
      // in the displayed range (say that occurred on `T12:00:25`) would still give us
      // the same day `/2020/01/02` and we want to redirect them to previous chunk from
      // that same day that still encompasses the closest message looking backwards,
      // like `/2020/01/02T13:00:00`
      if (dir === DIRECTION.backward) {
        // We choose `currentRangeStartTs` instead of `ts` (the jump point) because
        // TODO: why? and we don't choose `currentRangeEndTs` because TODO: why? - I
        // feel like I can't justify this, see
        // https://github.com/matrix-org/matrix-public-archive/pull/167#discussion_r1170850432
        const fromSameDay =
          tsForClosestEvent && areTimestampsFromSameUtcDay(currentRangeStartTs, tsForClosestEvent);
        const fromSameHour =
          tsForClosestEvent && areTimestampsFromSameUtcHour(currentRangeStartTs, tsForClosestEvent);
        const fromSameMinute =
          tsForClosestEvent &&
          areTimestampsFromSameUtcMinute(currentRangeStartTs, tsForClosestEvent);
        const fromSameSecond =
          tsForClosestEvent &&
          areTimestampsFromSameUtcSecond(currentRangeStartTs, tsForClosestEvent);

        // The closest event is from the same second we tried to jump from. Since we
        // can't represent something smaller than a second in the URL yet (we could do
        // ms but it's a conscious choice to make the URL cleaner,
        // #support-ms-time-slice), we will need to just return the timestamp with a
        // precision of seconds and hope that there isn't too many messages in this same
        // second.
        //
        // XXX: If there is too many messages all within the same second, people will be
        // stuck visiting the same page over and over every time they try to jump
        // backwards from that range.
        if (fromSameSecond) {
          newOriginServerTs = tsForClosestEvent;
          preferredPrecision = TIME_PRECISION_VALUES.seconds;
        }
        // The closest event is from the same minute we tried to jump from, we will need
        // to round up to the nearest second so that the URL encompasses the closest
        // event looking backwards
        else if (fromSameMinute) {
          newOriginServerTs = roundUpTimestampToUtcSecond(tsForClosestEvent);
          preferredPrecision = TIME_PRECISION_VALUES.seconds;
        }
        // The closest event is from the same hour we tried to jump from, we will need
        // to round up to the nearest minute so that the URL encompasses the closest
        // event looking backwards
        else if (fromSameHour) {
          newOriginServerTs = roundUpTimestampToUtcMinute(tsForClosestEvent);
          preferredPrecision = TIME_PRECISION_VALUES.minutes;
        }
        // The closest event is from the same day we tried to jump from, we will need to
        // round up to the nearest hour so that the URL encompasses the closest event
        // looking backwards
        else if (fromSameDay) {
          newOriginServerTs = roundUpTimestampToUtcHour(tsForClosestEvent);
          preferredPrecision = TIME_PRECISION_VALUES.minutes;
        }
        // We don't need to do anything. The next closest event is far enough away
        // (greater than 1 day) where we don't need to worry about the URL at all and
        // can just render whatever day that the closest event is from because the
        // archives biggest time-period represented in the URL is a day.
        //
        // We can display more than a day of content at a given URL (imagine lots of a
        // quiet days in a room), but the URL will never represent a time-period
        // greater than a day, ex. `/2023/01/01`. We don't allow someone to just
        // specify the month like `/2023/01` ❌
        else {
          newOriginServerTs = tsForClosestEvent;
        }
      }
      // When jumping forwards, the goal is to go forward 100 messages, so that when we
      // view the room at that point going backwards 100 messages (which is how the
      // archive works for any given date from the archive URL), we end up at the
      // perfect continuation spot in the room (seamless).
      //
      // XXX: This is flawed in the fact that when we go `/messages?dir=b` later, it
      // could backfill messages which will fill up the response before we perfectly
      // connect and continue from the position they were jumping from before. When
      // `/messages?dir=f` backfills (forwards fill), we won't have this problem anymore
      // because any messages backfilled in the forwards direction would be picked up
      // the same going backwards. See MSC4000
      // (https://github.com/matrix-org/matrix-spec-proposals/pull/4000).
      else if (dir === DIRECTION.forward) {
        // XXX: It would be cool to somehow cache this response and re-use our work here
        // for the actual room display that we redirect to from this route. No need for
        // us go out 100 messages, only for us to go backwards 100 messages again in the
        // next route.
        const messageResData = await getMessagesResponseFromEventId({
          accessToken: matrixAccessToken,
          roomId,
          eventId: eventIdForClosestEvent,
          dir: DIRECTION.forward,
          limit: archiveMessageLimit,
          abortSignal: req.abortSignal,
        });

        if (!messageResData.chunk?.length) {
          throw new StatusError(
            404,
            `/jump?dir=${dir}: /messages response didn't contain any more messages to jump to so we can assume we reached the end of the room.`
          );
        }

        const firstMessage = messageResData.chunk[0];
        const tsOfFirstMessage = firstMessage.origin_server_ts;

        const lastMessage = messageResData.chunk[messageResData.chunk.length - 1];
        const tsOfLastMessage = lastMessage.origin_server_ts;

        let msGapFromJumpPointToLastMessage;
        // If someone is jumping from `0`, let's assume this is their first time
        // navigating in the room and are just trying to get to the first messages in
        // the room. Instead of using `0` which give us `moreThanDayGap=true` every time
        // (unless someone sent messages in 1970 :P), and round us down to the nearest
        // day before any of the messages in the room start, let's just use the start of
        // the timeline as the start which will show us a page of content on the first
        // try. For the backwards direction, we could have a similar check but with
        // `currentRangeStartTs === Infinity` check but it's not necessary since we
        // don't have to do any back-tracking extra work.
        if (currentRangeEndTs === 0) {
          msGapFromJumpPointToLastMessage = tsOfLastMessage - tsOfFirstMessage;
        }
        // Otherwise do the normal calculation: where we jumped to - where we jumped from
        else {
          // TODO: Should we use `ts` or `currentRangeStartTs` here?
          msGapFromJumpPointToLastMessage = tsOfLastMessage - ts;
        }
        const moreThanDayGap = msGapFromJumpPointToLastMessage > ONE_DAY_IN_MS;
        const moreThanHourGap = msGapFromJumpPointToLastMessage > ONE_HOUR_IN_MS;
        const moreThanMinuteGap = msGapFromJumpPointToLastMessage > ONE_MINUTE_IN_MS;
        const moreThanSecondGap = msGapFromJumpPointToLastMessage > ONE_SECOND_IN_MS;

        // If the first message is on different day than the last message, then we know
        // there are messages on days before the last message and can safely round to the
        // nearest day and still see new content.
        //
        // We use this information to handle situations where we jump over multiple-day
        // gaps with no messages in between. In those cases, we don't want to round down
        // to a day where there are no messages in the gap.
        const hasMessagesOnDayBeforeDayOfLastMessage = !areTimestampsFromSameUtcDay(
          tsOfFirstMessage,
          tsOfLastMessage
        );

        // Back-track from the last message timestamp to the nearest date boundary.
        // Because we're back-tracking a couple events here, when we paginate back out
        // by the `archiveMessageLimit` later in the room route, it will guarantee some
        // overlap with the previous page we jumped from so we don't lose any messages
        // in the gap.
        //
        // We could choose to jump to the exact timestamp of the last message instead of
        // back-tracking but then we get ugly URL's every time you jump instead of being
        // able to back-track and round down to the nearest hour in a lot of cases. The
        // other reason not to return the exact date is maybe there multiple messages at
        // the same timestamp and we will lose messages in the gap because it displays
        // more than we thought.
        //
        // If the `/messages` response returns less than the `archiveMessageLimit`
        // looking forwards, it means we're looking at the latest events in the room. We
        // can simply just display the day that the latest event occurred on or the given
        // rangeEnd (whichever is later).
        const haveReachedLatestMessagesInRoom = messageResData.chunk?.length < archiveMessageLimit;
        if (haveReachedLatestMessagesInRoom) {
          const latestDesiredTs = Math.max(currentRangeEndTs, tsOfLastMessage);
          const latestDesiredDate = new Date(latestDesiredTs);
          const utcMidnightTs = getUtcStartOfDayTs(latestDesiredDate);
          newOriginServerTs = utcMidnightTs;
          preferredPrecision = TIME_PRECISION_VALUES.none;
        }
        // More than a day gap here, so we can just back-track to the nearest day as
        // long as there are messages we haven't seen yet if we visit the nearest day.
        else if (moreThanDayGap && hasMessagesOnDayBeforeDayOfLastMessage) {
          const utcMidnightOfDayBefore = getUtcStartOfDayTs(tsOfLastMessage);
          // We `- 1` from UTC midnight to get the timestamp that is a millisecond
          // before the next day but we choose a no time precision so we jump to just
          // the bare date without a time. A bare date in the `/date/2022/12/16`
          // endpoint represents the end of that day looking backwards so this is
          // exactly what we want.
          const endOfDayBeforeTs = utcMidnightOfDayBefore - 1;
          newOriginServerTs = endOfDayBeforeTs;
          preferredPrecision = TIME_PRECISION_VALUES.none;
        }
        // More than a hour gap here, we will need to back-track to the nearest hour
        else if (moreThanHourGap) {
          const utcTopOfHourBefore = getUtcStartOfHourTs(tsOfLastMessage);
          newOriginServerTs = utcTopOfHourBefore;
          preferredPrecision = TIME_PRECISION_VALUES.minutes;
        }
        // More than a minute gap here, we will need to back-track to the nearest minute
        else if (moreThanMinuteGap) {
          const utcTopOfMinuteBefore = getUtcStartOfMinuteTs(tsOfLastMessage);
          newOriginServerTs = utcTopOfMinuteBefore;
          preferredPrecision = TIME_PRECISION_VALUES.minutes;
        }
        // More than a second gap here, we will need to back-track to the nearest second
        else if (moreThanSecondGap) {
          const utcTopOfSecondBefore = getUtcStartOfSecondTs(tsOfLastMessage);
          newOriginServerTs = utcTopOfSecondBefore;
          preferredPrecision = TIME_PRECISION_VALUES.seconds;
        }
        // Less than a second gap here, we will give up.
        //
        // XXX: Maybe we can support ms here (#support-ms-time-slice)
        else {
          // 501 Not Implemented: the server does not support the functionality required
          // to fulfill the request
          res.status(501);
          res.send(
            `/jump ran into a problem: ${getErrorStringForTooManyMessages(archiveMessageLimit)}`
          );
          return;
        }
      }
    } catch (err) {
      const is404HTTPResponseError =
        err instanceof HTTPResponseError && err.response.status === 404;
      const is404StatusError = err instanceof StatusError && err.status === 404;
      const is404Error = is404HTTPResponseError || is404StatusError;
      // A 404 error just means there is no more messages to paginate in that room and
      // we should try to go to the predecessor/successor room appropriately.
      if (is404Error) {
        if (dir === DIRECTION.backward) {
          const {
            currentRoomCreationTs,
            predecessorRoomId,
            predecessorLastKnownEventId,
            predecessorViaServers,
          } = await fetchPredecessorInfo(matrixAccessToken, roomId, {
            abortSignal: req.abortSignal,
          });

          if (!predecessorRoomId) {
            throw new StatusError(
              404,
              `No predecessor room found for ${roomId} so we can't jump backwards to anywhere (you already reached the end of the room)`
            );
          }

          // We have to join the predecessor room before we can fetch the successor info
          // (this could be our first time seeing the room)
          await ensureRoomJoined(matrixAccessToken, predecessorRoomId, {
            viaServers,
            abortSignal: req.abortSignal,
          });
          const {
            successorRoomId: successorRoomIdForPredecessor,
            successorSetTs: successorSetTsForPredecessor,
          } = await fetchSuccessorInfo(matrixAccessToken, predecessorRoomId, {
            abortSignal: req.abortSignal,
          });

          let tombstoneEventId;
          if (!predecessorLastKnownEventId) {
            // This is a hack because we can't get the tombstone event ID directly from
            // `fetchSuccessorInfo(...)` and the `/state?format=event`
            // endpoint, so we have to do this trick. Related to
            // https://github.com/matrix-org/synapse/issues/15454
            //
            // We just assume this is the tombstone event ID but in any case it gets us to
            // an event that happened at the same time.
            ({ eventId: tombstoneEventId } = await timestampToEvent({
              accessToken: matrixAccessToken,
              roomId: predecessorRoomId,
              ts: successorSetTsForPredecessor,
              direction: DIRECTION.backward,
              abortSignal: req.abortSignal,
            }));
          }

          // Try to continue from the tombstone event in the predecessor room because
          // that is the signal that the room admins gave to indicate the end of the
          // room in favor of the other regardless of further activity that may have
          // occurred in the room.
          //
          // Make sure the the room that the predecessor specifies as the replacement
          // room is the same as what the current room is. This is a good signal that
          // the rooms are a true continuation of each other and the room admins agree.
          let continueAtTsInPredecessorRoom;
          if (successorRoomIdForPredecessor === roomId) {
            continueAtTsInPredecessorRoom = successorSetTsForPredecessor;
          }
          // Fallback to the room creation event time if we can't find the predecessor
          // room tombstone which will work just fine and as expected for normal room
          // upgrade scenarios.
          else {
            continueAtTsInPredecessorRoom = currentRoomCreationTs;
          }

          if (
            continueAtTsInPredecessorRoom === null ||
            continueAtTsInPredecessorRoom === undefined
          ) {
            throw new StatusError(
              500,
              `You navigated past the end of the room and it has a predecessor set (${predecessorRoomId}) ` +
                `but we were unable to find a suitable place to jump to and continue from. ` +
                `We could just redirect you to that predecessor room but we decided to throw an error ` +
                `instead because we should be able to fallback to the room creation time in any case. ` +
                `In other words, there shouldn't be a reason why we can't fetch the \`m.room.create\`` +
                `event for this room unless the server is just broken right now. You can try refreshing to try again.`
            );
          }

          // Jump to the predecessor room at the appropriate timestamp to continue from.
          // Since we're going backwards, we already know where to go so we can navigate
          // straight there.
          res.redirect(
            matrixPublicArchiveURLCreator.archiveUrlForDate(
              predecessorRoomId,
              // XXX: We should probably go fetch and use the timestamp from
              // `predecessorLastKnownEventId` here but that requires an extra
              // `timestampToEvent(...)` lookup. We can assume it's close to the
              // tombstone for now.
              new Date(continueAtTsInPredecessorRoom),
              {
                viaServers: Array.from(predecessorViaServers || []),
                scrollStartEventId: predecessorLastKnownEventId || tombstoneEventId,
                // We can just visit a rough time where the tombstone is as we assume
                // it's the last event in the room or at least the last event we care
                // about. A given day should be good for most cases but it's possible
                // that messages are sent after the tombstone and we end up missing the
                // tombstone.
                preferredPrecision: TIME_PRECISION_VALUES.none,
              }
            )
          );
          return;
        } else if (dir === DIRECTION.forward) {
          const { successorRoomId } = await fetchSuccessorInfo(matrixAccessToken, roomId, {
            abortSignal: req.abortSignal,
          });
          if (successorRoomId) {
            // Jump to the successor room and continue at the first event of the room
            res.redirect(
              matrixPublicArchiveURLCreator.archiveJumpUrlForRoom(successorRoomId, {
                dir: DIRECTION.forward,
                currentRangeStartTs: 0,
                currentRangeEndTs: 0,
                // We don't need to define
                // `currentRangeStartEventId`/`currentRangeEndEventId` here because we're
                // jumping to a completely new room so the event IDs won't pertain to the
                // new room and we don't have any to use anyway.
              })
            );
            return;
          }
        }
      }
      // Only throw if it's something other than a 404 error. 404 errors are fine, they
      // just mean there is no more messages to paginate in that room and we were
      // already viewing the latest in the room.
      else {
        throw err;
      }
    }

    // If we can't find any more messages to paginate to, just progress the date by a
    // day in whatever direction they wanted to go so we can display the empty view for
    // that day.
    if (!newOriginServerTs) {
      let tsAtRangeBoundaryInDirection;
      if (dir === DIRECTION.backward) {
        tsAtRangeBoundaryInDirection = currentRangeStartTs;
      } else if (dir === DIRECTION.forward) {
        tsAtRangeBoundaryInDirection = currentRangeEndTs;
      }

      const dateAtRangeBoundaryInDirection = new Date(tsAtRangeBoundaryInDirection);
      const yyyy = dateAtRangeBoundaryInDirection.getUTCFullYear();
      const mm = dateAtRangeBoundaryInDirection.getUTCMonth();
      const dd = dateAtRangeBoundaryInDirection.getUTCDate();

      const newDayDelta = dir === DIRECTION.forward ? 1 : -1;
      newOriginServerTs = Date.UTC(yyyy, mm, dd + newDayDelta);
    }

    // Redirect to a day with messages
    const archiveUrlToRedirecTo = matrixPublicArchiveURLCreator.archiveUrlForDate(
      roomIdOrAlias,
      new Date(newOriginServerTs),
      {
        // Start the scroll at the next event from where they jumped from (seamless navigation)
        scrollStartEventId: eventIdForClosestEvent,
        preferredPrecision,
      }
    );
    res.redirect(archiveUrlToRedirecTo);
  })
);
// Archive page for a room at a given date: `/date/YYYY/MM/DD` with an optional
// `T<hh>:<mm>[:<ss>]` time suffix. Shows messages from the given date/time looking
// backwards up to the configured `archiveMessageLimit`.
//
// Possible outcomes, in the order they are checked:
//  - throws a 404 `StatusError` when the requested date is in the future
//  - throws a 403 `StatusError` when the room's history visibility/join rule does
//    not allow it to be viewed in the archive
//  - redirects to the predecessor room when no events were found and the room has
//    a predecessor
//  - redirects to the successor room when viewing past the point where the
//    successor was set
//  - otherwise responds with the rendered `text/html` page
router.get(
  // The extra set of parentheses around `((:\\d\\d?)?)` is to work around a
  // `path-to-regexp` bug where the `?` wasn't attaching to the capture group, see
  // https://github.com/pillarjs/path-to-regexp/issues/287
  '/date/:yyyy(\\d{4})/:mm(\\d{2})/:dd(\\d{2}):time(T\\d\\d?:\\d\\d?((:\\d\\d?)?))?',
  identifyRoute('app-archive-room-date'),
  // eslint-disable-next-line max-statements, complexity
  asyncHandler(async function (req, res) {
    const nowTs = Date.now();
    const roomIdOrAlias = getRoomIdOrAliasFromReq(req);

    // We pull this fresh from the config for each request to ensure we have an
    // updated value between each e2e test
    const archiveMessageLimit = config.get('archiveMessageLimit');
    assert(archiveMessageLimit);
    // Synapse has a max `/messages` limit of 1000. We enforce both ends of the
    // documented range so a negative limit can't slip through to the fetch below.
    assert(
      archiveMessageLimit >= 1 && archiveMessageLimit <= 999,
      'archiveMessageLimit needs to be in range [1, 999]. We can only get 1000 messages at a time from Synapse and we need a buffer of at least one to see if there are too many messages on a given day so you can only configure a max of 999. If you need more messages, we will have to implement pagination'
    );

    const { toTimestamp, yyyy, mm, dd, timeDefined, secondsDefined } =
      parseArchiveRangeFromReq(req);

    // The precision of the time given in the URL; passed along on redirects and
    // into the render context.
    let precisionFromUrl = TIME_PRECISION_VALUES.none;
    if (secondsDefined) {
      precisionFromUrl = TIME_PRECISION_VALUES.seconds;
    } else if (timeDefined) {
      precisionFromUrl = TIME_PRECISION_VALUES.minutes;
    }

    // Just 404 if anyone is trying to view the future, no need to waste resources on
    // that
    if (toTimestamp > roundUpTimestampToUtcDay(nowTs)) {
      throw new StatusError(
        404,
        `You can't view the history of a room on a future day (${new Date(
          toTimestamp
        ).toISOString()} > ${new Date(nowTs).toISOString()}). Go back`
      );
    }

    // We have to wait for the room join to happen first before we can fetch
    // any of the additional room info or messages.
    //
    // XXX: It would be better if we just tried fetching first and assume that we are
    // already joined and only join after we see a 403 Forbidden error (we should do
    // this for all places we `ensureRoomJoined`). But we need the `roomId` for use with
    // the various Matrix API's anyway and `/join/{roomIdOrAlias}` -> `{ room_id }` is a
    // great way to get it (see
    // https://github.com/matrix-org/matrix-public-archive/issues/50).
    const viaServers = parseViaServersFromUserInput(req.query.via);
    const roomId = await ensureRoomJoined(matrixAccessToken, roomIdOrAlias, {
      viaServers,
      abortSignal: req.abortSignal,
    });

    // Do these in parallel to avoid the extra time in sequential round-trips
    // (we want to display the archive page faster)
    const [roomData, { events, stateEventMap }] = await Promise.all([
      fetchRoomData(matrixAccessToken, roomId, { abortSignal: req.abortSignal }),
      // We over-fetch messages outside of the range of the given day so that we
      // can display messages from surrounding days (currently only from days
      // before) so that the quiet rooms don't feel as desolate and broken.
      //
      // When given a bare date like `2022/11/16`, we want to paginate from the end of that
      // day backwards. This is why we use the `toTimestamp` here and fetch backwards.
      fetchEventsFromTimestampBackwards({
        accessToken: matrixAccessToken,
        roomId,
        ts: toTimestamp,
        // We fetch one more than the `archiveMessageLimit` so that we can see if there
        // are too many messages from the given day. If we have over the
        // `archiveMessageLimit` number of messages fetching from the given day, it's
        // acceptable to have them be from surrounding days. But if all 500 messages
        // (for example) are from the same day, let's redirect to a smaller hour range
        // to display.
        limit: archiveMessageLimit + 1,
        abortSignal: req.abortSignal,
      }),
    ]);

    // Only `world_readable` or `shared` rooms that are `public` are viewable in the archive
    const allowedToViewRoom =
      roomData.historyVisibility === 'world_readable' ||
      (roomData.historyVisibility === 'shared' && roomData.joinRule === 'public');
    if (!allowedToViewRoom) {
      throw new StatusError(
        403,
        `Only \`world_readable\` or \`shared\` rooms that are \`public\` can be viewed in the archive. ${roomData.id} has m.room.history_visibility=${roomData.historyVisibility} m.room.join_rules=${roomData.joinRule}`
      );
    }

    // Since we're looking backwards from the given day, if we don't see any events,
    // then we can assume that it's before the start of the room (it's the only way we
    // would see no events).
    const hasNavigatedBeforeStartOfRoom = events.length === 0;

    // Check if we need to navigate backward to the predecessor room
    if (hasNavigatedBeforeStartOfRoom && roomData.predecessorRoomId) {
      // Jump to the predecessor room at the date/time the user is trying to visit at
      res.redirect(
        matrixPublicArchiveURLCreator.archiveUrlForDate(
          roomData.predecessorRoomId,
          new Date(toTimestamp),
          {
            preferredPrecision: precisionFromUrl,
            // XXX: Should we also try combining `viaServers` we used to get to this room?
            viaServers: Array.from(roomData.predecessorViaServers || []),
          }
        )
      );
      return;
    }

    // We only care to navigate to the successor room if we're trying to view something
    // past when the successor was set (it's an indicator that we need to go to the new
    // room from this time forward).
    const isNavigatedPastSuccessor = toTimestamp > roomData.successorSetTs;

    // But if we're viewing the day when the successor was set, we want to allow viewing
    // the room up until the successor was set.
    //
    // `events` may be empty here, so `newestEvent` can be `undefined`; the optional
    // chaining covers both a missing event and a missing timestamp.
    const newestEvent = events[events.length - 1];
    const isNewestEventFromSameDay =
      newestEvent?.origin_server_ts &&
      areTimestampsFromSameUtcDay(toTimestamp, newestEvent.origin_server_ts);

    // Check if we need to navigate forward to the successor room
    if (roomData.successorRoomId && isNavigatedPastSuccessor && !isNewestEventFromSameDay) {
      // Jump to the successor room at the date/time the user is trying to visit at
      res.redirect(
        matrixPublicArchiveURLCreator.archiveUrlForDate(
          roomData.successorRoomId,
          new Date(toTimestamp),
          {
            preferredPrecision: precisionFromUrl,
            // Just try to pass on the `viaServers` the user was using to get to this room
            viaServers: Array.from(viaServers || []),
          }
        )
      );
      return;
    }

    // No indexing is the safe default: we never index when `stopSearchEngineIndexing`
    // is set and otherwise only allow search engines to index `world_readable` rooms.
    const shouldIndex =
      !stopSearchEngineIndexing && roomData.historyVisibility === 'world_readable';

    const isNsfw = checkTextForNsfw(
      // We concat the name, topic, etc together to simply do a single check against
      // all of the text.
      `${roomData.name} --- ${roomData.canonicalAlias} --- ${roomData.topic} `
    );

    const pageOptions = {
      title: `${roomData.name} - Matrix Public Archive`,
      description: `View the history of the ${roomData.name} room in the Matrix Public Archive`,
      imageUrl:
        roomData.avatarUrl &&
        mxcUrlToHttpThumbnail({
          mxcUrl: roomData.avatarUrl,
          homeserverUrl: matrixServerUrl,
          size: 256,
        }),
      blockedBySafeSearch: isNsfw,
      entryPoint: 'client/js/entry-client-hydrogen.js',
      locationHref: urlJoin(basePath, req.originalUrl),
      shouldIndex,
      cspNonce: res.locals.cspNonce,
    };

    const pageHtml = await renderHydrogenVmRenderScriptToPageHtml({
      pageOptions,
      vmRenderScriptFilePath: path.resolve(__dirname, '../../shared/hydrogen-vm-render-script.js'),
      vmRenderContext: {
        toTimestamp,
        precisionFromUrl,
        roomData: {
          ...roomData,
          // The `canonicalAlias` will take precedence over the `roomId` when present so we only
          // want to use it if that's what the user originally browsed to. We shouldn't
          // try to switch someone over to the room alias if they browsed from the room
          // ID or vice versa.
          canonicalAlias:
            roomIdOrAlias === roomData.canonicalAlias ? roomData.canonicalAlias : undefined,
        },
        events,
        stateEventMap,
        shouldIndex,
        config: {
          basePath,
          matrixServerUrl,
          archiveMessageLimit,
        },
      },
      abortSignal: req.abortSignal,
    });

    setHeadersToPreloadAssets(res, pageOptions);

    // This is useful for caching purposes so you can heavily cache past content, but
    // not present/future.
    setHeadersForDateTemporalContext({
      res,
      nowTs,
      comparedToUrlDate: {
        yyyy,
        mm,
        dd,
      },
    });

    res.set('Content-Type', 'text/html');
    res.send(pageHtml);
  })
);
// Export the router that the routes in this file are registered on
module.exports = router;