Cyberes 2023-07-06 15:05:01 -06:00
commit 7879837f25
19 changed files with 631 additions and 134 deletions

View File

@ -2,12 +2,22 @@
- Prevent Cloudflare from overriding our own 504 timeout page, https://github.com/matrix-org/matrix-public-archive/pull/228
- Catch NSFW rooms with underscores, https://github.com/matrix-org/matrix-public-archive/pull/231
- Fix `18+` false positives with NSFW check, https://github.com/matrix-org/matrix-public-archive/pull/279
- Fix room cards sorting in the wrong direction on Firefox, https://github.com/matrix-org/matrix-public-archive/pull/261
- Remove `libera.chat` as a default since their rooms are not accessible in the archive, https://github.com/matrix-org/matrix-public-archive/pull/263
- Add reason why the archive bot is joining the room, https://github.com/matrix-org/matrix-public-archive/pull/262
- Add `/faq` redirect, https://github.com/matrix-org/matrix-public-archive/pull/265
- Use `rel=canonical` link to de-duplicate event permalinks, https://github.com/matrix-org/matrix-public-archive/pull/266, https://github.com/matrix-org/matrix-public-archive/pull/269
- Prevent join event spam with stable `reason`, https://github.com/matrix-org/matrix-public-archive/pull/268
- Don't allow previewing `shared` history rooms, https://github.com/matrix-org/matrix-public-archive/pull/239
- Contributed by [@tulir](https://github.com/tulir)
- Update FAQ to explain `world_readable` only, https://github.com/matrix-org/matrix-public-archive/pull/277
- Indicate when the room was set to `world_readable` and by who, https://github.com/matrix-org/matrix-public-archive/pull/278
- Only show `world_readable` rooms in the room directory, https://github.com/matrix-org/matrix-public-archive/pull/276
Developer facing:
- Fix eslint trying to look at `node_modules/`, https://github.com/matrix-org/matrix-public-archive/pull/275
# 0.1.0 - 2023-05-11

View File

@ -19,40 +19,22 @@ messages from any given date and day-by-day navigation.
## Why did the archive bot join my room?
Only public Matrix rooms with `shared` or `world_readable` [history
visibility](https://spec.matrix.org/latest/client-server-api/#room-history-visibility) are
accessible in the Matrix Public Archive. In some clients like Element, the `shared`
option equates to "Members only (since the point in time of selecting this option)" and
`world_readable` to "Anyone" under the **room settings** -> **Security & Privacy** ->
**Who can read history?**.
Only Matrix rooms with `world_readable` [history
visibility](https://spec.matrix.org/latest/client-server-api/#room-history-visibility)
are accessible in the Matrix Public Archive and indexed by search engines.
But the archive bot (`@archive:matrix.org`) will join any public room because it doesn't
know the history visibility without first joining. Any room without `world_readable` or
`shared` history visibility will lead to a `403 Forbidden`. And if the public room is in
the room directory, it will be listed in the archive but will still lead to a `403
Forbidden` in that case.
know the history visibility without first joining. Any room that doesn't have
`world_readable` history visibility will lead to a `403 Forbidden`.
The Matrix Public Archive doesn't hold onto any data (it's
stateless) and requests the messages from the homeserver every time. The
[matrix-archive.evulid.cc](https://matrix-archive.evulid.cc/) instance has some caching in place, 5
minutes for the current day, and 2 days for past content.
The Matrix Public Archive only allows rooms with `world_readable` history visibility to
be indexed by search engines. See the [opt
out](#how-do-i-opt-out-and-keep-my-room-from-being-indexed-by-search-engines) topic
below for more details.
### Why does the archive user join rooms instead of browsing them as a guest?
Guests require `m.room.guest_access` to access a room. Most public rooms do not allow
guests because even the `public_chat` preset when creating a room does not allow guest
access. Not being able to view most public rooms is the major blocker on being able to
use guest access. The idea is if I can view the messages from a Matrix client as a
random user, I should also be able to see the messages in the archive.
Guest access is also a much different ask than read-only access since guests can also
send messages in the room which isn't always desirable. The archive bot is read-only and
does not send messages.
See the [opt out
section](#how-do-i-opt-out-and-keep-my-room-from-being-indexed-by-search-engines) below
for more details.
## Technical details

View File

@ -7,7 +7,7 @@
"url": "https://github.com/matrix-org/matrix-public-archive"
},
"scripts": {
"lint": "eslint **/*.js",
"lint": "eslint \"**/*.js\"",
"build": "node ./build-scripts/do-client-build.js",
"start": "node server/server.js",
"start-dev": "node server/start-dev.js",

View File

@ -0,0 +1,191 @@
'use strict';
const assert = require('assert');
const urlJoin = require('url-join');
const { DIRECTION } = require('matrix-public-archive-shared/lib/reference-values');
const { fetchEndpointAsJson } = require('../fetch-endpoint');
const { traceFunction } = require('../../tracing/trace-utilities');
const config = require('../config');
const matrixServerUrl = config.get('matrixServerUrl');
assert(matrixServerUrl);
// The number of requests we should make to try to fill the limit before bailing out
const NUM_MAX_REQUESTS = 10;
/**
 * Make a single `POST _matrix/client/v3/publicRooms` request against the configured
 * `matrixServerUrl` and hand back the response payload untouched.
 *
 * @param {string} accessToken - Passed through to `fetchEndpointAsJson`
 * @param {object} [options]
 * @param {string} [options.server] - Added as the `?server=` query parameter when truthy
 * @param {string} [options.searchTerm] - Sent as `filter.generic_search_term`
 * @param {string} [options.paginationToken] - Sent as `since`
 * @param {number} [options.limit] - Sent as `limit`
 * @param {AbortSignal} [options.abortSignal] - Passed through to `fetchEndpointAsJson`
 * @returns {Promise<object>} The `data` property of whatever `fetchEndpointAsJson` resolves with
 */
async function requestPublicRooms(
  accessToken,
  { server, searchTerm, paginationToken, limit, abortSignal } = {}
) {
  // The `server` query parameter is only included when a server was asked for
  const queryParams = new URLSearchParams();
  if (server) {
    queryParams.append('server', server);
  }

  const endpoint = urlJoin(
    matrixServerUrl,
    `_matrix/client/v3/publicRooms?${queryParams.toString()}`
  );

  const requestBody = {
    include_all_networks: true,
    filter: {
      generic_search_term: searchTerm,
    },
    since: paginationToken,
    limit,
  };

  const response = await fetchEndpointAsJson(endpoint, {
    method: 'POST',
    body: requestBody,
    accessToken,
    abortSignal,
  });

  return response.data;
}
/**
 * Fetch one page of rooms from the homeserver room directory, keeping only the rooms
 * flagged `world_readable` or `shared`, and work out pagination tokens that line up
 * exactly with the rooms returned.
 *
 * Because rooms are filtered out after fetching, we over-fetch (5x `limit` per request,
 * up to `NUM_MAX_REQUESTS` requests) until `limit` rooms are collected or the directory
 * runs out. If the limit was filled part-way through a response, one extra request is
 * made to get a token that continues right after the last room we consumed.
 *
 * @param {string} accessToken - Required; passed to every `requestPublicRooms` call
 * @param {object} [options]
 * @param {string} [options.server] - Passed through to `requestPublicRooms`
 * @param {string} [options.searchTerm] - Passed through to `requestPublicRooms`
 * @param {string} [options.paginationToken] - Token to start paginating from
 * @param {string} [options.direction=DIRECTION.forward] - One of `DIRECTION.forward` or
 *   `DIRECTION.backward`
 * @param {number} options.limit - Number of rooms wanted; must be greater than 0
 * @param {AbortSignal} [options.abortSignal] - Passed through to `requestPublicRooms`
 * @returns {Promise<{rooms: object[], prevPaginationToken: (string|undefined), nextPaginationToken: (string|undefined)}>}
 * @throws {AssertionError} When `accessToken` is missing, `direction` is invalid or
 *   `limit` is not a positive number
 */
// eslint-disable-next-line complexity, max-statements
async function fetchAccessibleRooms(
  accessToken,
  {
    server,
    searchTerm,
    // Direction is baked into the pagination token but since we're unable to decipher
    // it from the opaque token, we also have to pass it in explicitly.
    paginationToken,
    direction = DIRECTION.forward,
    limit,
    abortSignal,
  } = {}
) {
  assert(accessToken);
  assert([DIRECTION.forward, DIRECTION.backward].includes(direction), 'direction must be [f|b]');
  // Without a positive limit the fetch loop below would never run and we would fail
  // later with an opaque `TypeError` while reading tokens off of an `undefined` response.
  assert(limit > 0, 'limit must be a positive number');

  const isForward = direction === DIRECTION.forward;

  // Based off of the matrix.org room directory, only 42% of rooms are world_readable,
  // which means our best bet to fill up the results to the limit is to request at least
  // 2.4 times as many. I've doubled and rounded it up to 5 times as many so we can have
  // less round-trips.
  const bulkPaginationLimit = Math.ceil(5 * limit);

  const accessibleRooms = [];
  let firstResponse;
  let lastResponse;
  let loopToken = paginationToken;
  let lastLoopToken;
  // Position within the last response where we filled the limit. `undefined` means we
  // never filled it; `0` is a perfectly valid position (the first room of the response
  // filled the limit) so this must never be checked by truthiness.
  let continuationIndex;
  let currentRequestCount = 0;
  while (
    // Stop if we have reached the limit of rooms we want to fetch
    accessibleRooms.length < limit &&
    // And bail if we've already gone through a bunch of pages to try to fill the limit
    currentRequestCount < NUM_MAX_REQUESTS &&
    // Always do the first request. After that, only keep going while we have a token to
    // continue with (otherwise we've reached the end of the pagination).
    (currentRequestCount === 0 || loopToken)
  ) {
    const publicRoomsRes = await requestPublicRooms(accessToken, {
      server,
      searchTerm,
      paginationToken: loopToken,
      limit: bulkPaginationLimit,
      abortSignal,
    });
    lastLoopToken = loopToken;
    lastResponse = publicRoomsRes;
    if (currentRequestCount === 0) {
      firstResponse = publicRoomsRes;
    }

    // Get the token ready for the next loop
    loopToken = isForward ? publicRoomsRes.next_batch : publicRoomsRes.prev_batch;

    // Walk the rooms starting from the ones closest to the pagination token. Copy before
    // reversing so we don't mutate the response we were handed.
    const fetchedRooms = publicRoomsRes.chunk;
    const fetchedRoomsInDirection = isForward ? fetchedRooms : [...fetchedRooms].reverse();

    for (const [index, room] of fetchedRoomsInDirection.entries()) {
      // Only keep rooms flagged `world_readable` or `shared`
      if (room.world_readable || room.shared) {
        if (isForward) {
          accessibleRooms.push(room);
        } else {
          // Going backward, we prepend so the final list stays in directory order
          accessibleRooms.unshift(room);
        }
      }

      // Stop after we've reached the limit and remember where we stopped
      if (accessibleRooms.length >= limit) {
        continuationIndex = index;
        break;
      }
    }

    currentRequestCount += 1;
  }

  // Back-track to get the perfect continuation point and show exactly the limit of
  // rooms in the grid.
  //
  // Alternatively, we could just not worry about it and show more than the limit of
  // rooms.
  //
  // XXX: Since the room directory order is not stable, this is slightly flawed as the
  // results could have shifted slightly from when we made the last request to now but
  // we assume it's good enough.
  let farEdgeResponse = lastResponse;
  if (continuationIndex !== undefined) {
    farEdgeResponse = await requestPublicRooms(accessToken, {
      server,
      searchTerm,
      // Start from the last request
      paginationToken: lastLoopToken,
      // Then only go out as far out as the continuation index (the point when we filled
      // the limit)
      limit: continuationIndex + 1,
      abortSignal,
    });
  }

  // The token pointing back to where we came from is taken from the first response and
  // the token to keep going in `direction` is taken from the far edge.
  let nextPaginationToken;
  let prevPaginationToken;
  if (isForward) {
    prevPaginationToken = firstResponse.prev_batch;
    nextPaginationToken = farEdgeResponse.next_batch;
  } else {
    prevPaginationToken = farEdgeResponse.prev_batch;
    nextPaginationToken = firstResponse.next_batch;
  }

  return {
    rooms: accessibleRooms,
    prevPaginationToken,
    nextPaginationToken,
  };
}
module.exports = traceFunction(fetchAccessibleRooms);

View File

@ -1,57 +0,0 @@
'use strict';
const assert = require('assert');
const urlJoin = require('url-join');
const { fetchEndpointAsJson } = require('../fetch-endpoint');
const { traceFunction } = require('../../tracing/trace-utilities');
const config = require('../config');
const matrixServerUrl = config.get('matrixServerUrl');
assert(matrixServerUrl);
/**
 * Ask the homeserver room directory (`POST _matrix/client/v3/publicRooms`) for a page of
 * rooms and keep only the ones whose `join_rule` is `public`.
 *
 * @param {string} accessToken - Required; passed through to `fetchEndpointAsJson`
 * @param {object} [options]
 * @param {string} [options.server] - Added as the `?server=` query parameter when truthy
 * @param {string} [options.searchTerm] - Sent as `filter.generic_search_term`
 * @param {string} [options.paginationToken] - Sent as `since`
 * @param {number} [options.limit] - Sent as `limit`
 * @param {AbortSignal} [options.abortSignal] - Passed through to `fetchEndpointAsJson`
 * @returns {Promise<{rooms: object[], nextPaginationToken: *, prevPaginationToken: *}>}
 *   The filtered rooms plus the `next_batch`/`prev_batch` tokens from the response
 */
async function fetchPublicRooms(
  accessToken,
  { server, searchTerm, paginationToken, limit, abortSignal } = {}
) {
  assert(accessToken);

  const queryParams = new URLSearchParams();
  if (server) {
    queryParams.append('server', server);
  }

  const endpoint = urlJoin(
    matrixServerUrl,
    `_matrix/client/v3/publicRooms?${queryParams.toString()}`
  );

  const requestBody = {
    include_all_networks: true,
    filter: {
      generic_search_term: searchTerm,
    },
    since: paginationToken,
    limit,
  };

  const response = await fetchEndpointAsJson(endpoint, {
    method: 'POST',
    body: requestBody,
    accessToken,
    abortSignal,
  });
  const directoryPage = response.data;

  // We only want to see public rooms in the archive.
  //
  // `room.world_readable` is also accessible here but we only use history
  // `world_readable` to determine search indexing.
  const rooms = directoryPage.chunk.filter((room) => room.join_rule === 'public');

  return {
    rooms,
    nextPaginationToken: directoryPage.next_batch,
    prevPaginationToken: directoryPage.prev_batch,
  };
}
module.exports = traceFunction(fetchPublicRooms);

View File

@ -155,7 +155,6 @@ const fetchRoomData = traceFunction(async function (
stateCanonicalAliasResDataOutcome,
stateAvatarResDataOutcome,
stateHistoryVisibilityResDataOutcome,
stateJoinRulesResDataOutcome,
predecessorInfoOutcome,
successorInfoOutcome,
] = await Promise.allSettled([
@ -182,10 +181,6 @@ const fetchRoomData = traceFunction(async function (
abortSignal,
}
),
fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.join_rules'), {
accessToken: matrixAccessToken,
abortSignal,
}),
fetchPredecessorInfo(matrixAccessToken, roomId, { abortSignal }),
fetchSuccessorInfo(matrixAccessToken, roomId, { abortSignal }),
]);
@ -215,15 +210,15 @@ const fetchRoomData = traceFunction(async function (
}
let historyVisibility;
let historyVisibilityEventMeta;
if (stateHistoryVisibilityResDataOutcome.reason === undefined) {
const { data } = stateHistoryVisibilityResDataOutcome.value;
historyVisibility = data?.content?.history_visibility;
}
let joinRule;
if (stateJoinRulesResDataOutcome.reason === undefined) {
const { data } = stateJoinRulesResDataOutcome.value;
joinRule = data?.content?.join_rule;
historyVisibilityEventMeta = {
historyVisibility,
sender: data?.sender,
originServerTs: data?.origin_server_ts,
};
}
let roomCreationTs;
@ -251,7 +246,7 @@ const fetchRoomData = traceFunction(async function (
canonicalAlias,
avatarUrl,
historyVisibility,
joinRule,
historyVisibilityEventMeta,
roomCreationTs,
predecessorRoomId,
predecessorLastKnownEventId,

View File

@ -6,10 +6,11 @@ const urlJoin = require('url-join');
const express = require('express');
const asyncHandler = require('../lib/express-async-handler');
const { DIRECTION } = require('matrix-public-archive-shared/lib/reference-values');
const RouteTimeoutAbortError = require('../lib/errors/route-timeout-abort-error');
const UserClosedConnectionAbortError = require('../lib/errors/user-closed-connection-abort-error');
const identifyRoute = require('../middleware/identify-route-middleware');
const fetchPublicRooms = require('../lib/matrix-utils/fetch-public-rooms');
const fetchAccessibleRooms = require('../lib/matrix-utils/fetch-accessible-rooms');
const renderHydrogenVmRenderScriptToPageHtml = require('../hydrogen-render/render-hydrogen-vm-render-script-to-page-html');
const setHeadersToPreloadAssets = require('../lib/set-headers-to-preload-assets');
@ -22,7 +23,6 @@ const matrixServerName = config.get('matrixServerName');
assert(matrixServerName);
const matrixAccessToken = config.get('matrixAccessToken');
assert(matrixAccessToken);
const stopSearchEngineIndexing = config.get('stopSearchEngineIndexing');
const router = express.Router({
caseSensitive: true,
@ -34,9 +34,19 @@ router.get(
'/',
identifyRoute('app-room-directory-index'),
asyncHandler(async function (req, res) {
const paginationToken = req.query.page;
const searchTerm = req.query.search;
const homeserver = req.query.homeserver;
const paginationToken = req.query.page;
const direction = req.query.dir;
// You must provide both `paginationToken` and `direction` if either is defined
if (paginationToken || direction) {
assert(
[DIRECTION.forward, DIRECTION.backward].includes(direction),
'?dir query parameter must be [f|b]'
);
assert(paginationToken, '?page query parameter must be defined if ?dir is defined');
}
// It would be good to grab more rooms than we display in case we need
// to filter any out but then the pagination tokens with the homeserver
@ -49,12 +59,13 @@ router.get(
let prevPaginationToken;
let roomFetchError;
try {
({ rooms, nextPaginationToken, prevPaginationToken } = await fetchPublicRooms(
({ rooms, nextPaginationToken, prevPaginationToken } = await fetchAccessibleRooms(
matrixAccessToken,
{
server: homeserver,
searchTerm,
paginationToken,
direction,
limit,
abortSignal: req.abortSignal,
}
@ -71,7 +82,8 @@ router.get(
}
// We index the room directory unless the config says we shouldn't index anything
const shouldIndex = !stopSearchEngineIndexing;
const stopSearchEngineIndexingFromConfig = config.get('stopSearchEngineIndexing');
const shouldIndex = !stopSearchEngineIndexingFromConfig;
const pageOptions = {
title: `Matrix Public Archive`,

View File

@ -57,7 +57,6 @@ const matrixServerUrl = config.get('matrixServerUrl');
assert(matrixServerUrl);
const matrixAccessToken = config.get('matrixAccessToken');
assert(matrixAccessToken);
const stopSearchEngineIndexing = config.get('stopSearchEngineIndexing');
const matrixPublicArchiveURLCreator = new MatrixPublicArchiveURLCreator(basePath);
@ -828,15 +827,16 @@ router.get(
}),
]);
// Only `world_readable` or `shared` rooms that are `public` are viewable in the archive
const allowedToViewRoom =
roomData.historyVisibility === 'world_readable' ||
(roomData.historyVisibility === 'shared' && roomData.joinRule === 'public');
// Only `world_readable` rooms are viewable in the archive
const allowedToViewRoom = true;
if (!allowedToViewRoom) {
throw new StatusError(
403,
`Only \`world_readable\` or \`shared\` rooms that are \`public\` can be viewed in the archive. ${roomData.id} has m.room.history_visiblity=${roomData.historyVisibility} m.room.join_rules=${roomData.joinRule}`
`Only \`world_readable\` rooms can be viewed in the archive. ` +
`${roomData.id} has m.room.history_visiblity=${roomData.historyVisibility} ` +
`(set by ${roomData.historyVisibilityEventMeta?.sender} on ` +
`${new Date(roomData.historyVisibilityEventMeta?.originServerTs).toISOString()})`
);
}
@ -891,7 +891,8 @@ router.get(
// Default to no indexing (safe default)
let shouldIndex = false;
if (stopSearchEngineIndexing) {
const stopSearchEngineIndexingFromConfig = config.get('stopSearchEngineIndexing');
if (stopSearchEngineIndexingFromConfig) {
shouldIndex = false;
} else {
// Otherwise we only allow search engines to index `world_readable` rooms

View File

@ -118,6 +118,7 @@ async function mountHydrogen() {
events,
stateEventMap,
shouldIndex,
historyVisibilityEventMeta: roomData.historyVisibilityEventMeta,
basePath: config.basePath,
});

View File

@ -1,7 +1,13 @@
'use strict';
const escapeStringRegexp = require('escape-string-regexp');
const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+'];
const NSFW_REGEXES = NSFW_WORDS.map((word) => new RegExp(`(\\b|_)${word}(\\b|_)`, 'i'));
const NSFW_REGEXES = NSFW_WORDS.map(
// We use `(\b|_|-|\s|^)` instead of just `(\b|_)` because the word boundary doesn't
// match next to the `+` sign in `18+`
(word) => new RegExp(`(\\b|_|-|\\s|^)${escapeStringRegexp(word)}(\\b|_|-|\\s|$)`, 'i')
);
// A very basic check for NSFW content that just looks for some keywords in the given
// text

View File

@ -3,7 +3,10 @@
const urlJoin = require('url-join');
const assert = require('matrix-public-archive-shared/lib/assert');
const { TIME_PRECISION_VALUES } = require('matrix-public-archive-shared/lib/reference-values');
const {
DIRECTION,
TIME_PRECISION_VALUES,
} = require('matrix-public-archive-shared/lib/reference-values');
function qsToUrlPiece(qs) {
if (qs.toString()) {
@ -25,7 +28,16 @@ class URLCreator {
return `https://matrix.to/#/${roomIdOrAlias}`;
}
roomDirectoryUrl({ searchTerm, homeserver, paginationToken } = {}) {
roomDirectoryUrl({ searchTerm, homeserver, paginationToken, direction } = {}) {
// You must provide both `paginationToken` and `direction` if either is defined
if (paginationToken || direction) {
assert(
[DIRECTION.forward, DIRECTION.backward].includes(direction),
'direction must be [f|b]'
);
assert(paginationToken);
}
let qs = new URLSearchParams();
if (searchTerm) {
qs.append('search', searchTerm);
@ -36,6 +48,9 @@ class URLCreator {
if (paginationToken) {
qs.append('page', paginationToken);
}
if (direction) {
qs.append('dir', direction);
}
return `${this._basePath}${qsToUrlPiece(qs)}`;
}

View File

@ -75,6 +75,7 @@ class ArchiveRoomViewModel extends ViewModel {
events,
stateEventMap,
shouldIndex,
historyVisibilityEventMeta,
basePath,
} = options;
assert(homeserverUrl);
@ -85,6 +86,9 @@ class ArchiveRoomViewModel extends ViewModel {
assert(events);
assert(stateEventMap);
assert(shouldIndex !== undefined);
assert(historyVisibilityEventMeta.historyVisibility);
assert(historyVisibilityEventMeta.sender);
assert(historyVisibilityEventMeta.originServerTs);
assert(events);
this._room = room;
@ -213,6 +217,7 @@ class ArchiveRoomViewModel extends ViewModel {
shouldShowTimeSelector,
timeSelectorViewModel: this._timeSelectorViewModel,
shouldIndex,
historyVisibilityEventMeta,
get developerOptionsUrl() {
return urlRouter.urlForSegments([
navigation.segment('room', room.id),

View File

@ -9,6 +9,7 @@ const ModalViewModel = require('matrix-public-archive-shared/viewmodels/ModalVie
const HomeserverSelectionModalContentViewModel = require('matrix-public-archive-shared/viewmodels/HomeserverSelectionModalContentViewModel');
const RoomCardViewModel = require('matrix-public-archive-shared/viewmodels/RoomCardViewModel');
const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw');
const { DIRECTION } = require('../lib/reference-values');
const DEFAULT_SERVER_LIST = ['matrix.org', 'gitter.im'];
@ -304,6 +305,7 @@ class RoomDirectoryViewModel extends ViewModel {
homeserver: this.homeserverSelection,
searchTerm: this.searchTerm,
paginationToken: this._nextPaginationToken,
direction: DIRECTION.forward,
});
}
@ -316,6 +318,7 @@ class RoomDirectoryViewModel extends ViewModel {
homeserver: this.homeserverSelection,
searchTerm: this.searchTerm,
paginationToken: this._prevPaginationToken,
direction: DIRECTION.backward,
});
}

View File

@ -10,12 +10,28 @@ class RightPanelContentView extends TemplateView {
render(t, vm) {
assert(vm.shouldIndex !== undefined);
assert(vm.shouldShowTimeSelector !== undefined);
assert(vm.historyVisibilityEventMeta.historyVisibility);
assert(vm.historyVisibilityEventMeta.sender);
assert(vm.historyVisibilityEventMeta.originServerTs);
let maybeIndexedMessage = 'This room is not being indexed by search engines ';
if (vm.shouldIndex) {
maybeIndexedMessage = 'This room is being indexed by search engines ';
maybeIndexedMessage = 'This room is being indexed by search engines';
}
const historyVisibilitySender = vm.historyVisibilityEventMeta.sender;
let historyVisibilityDisplayValue = vm.historyVisibilityEventMeta.historyVisibility;
if (vm.historyVisibilityEventMeta.historyVisibility === 'world_readable' || vm.historyVisibilityEventMeta.historyVisibility === 'shared') {
historyVisibilityDisplayValue = vm.historyVisibilityEventMeta.historyVisibility.replace('_', ' ');
}
const [historyVisibilitySetDatePiece, _timePiece] = new Date(
vm.historyVisibilityEventMeta.originServerTs
)
.toISOString()
.split('T');
return t.div(
{
className: 'RightPanelContentView',
@ -33,9 +49,13 @@ class RightPanelContentView extends TemplateView {
className: 'RightPanelContentView_footer',
},
[
t.p([
`This room is accessible in the archive because it was set to ` +
`${historyVisibilityDisplayValue} by ${historyVisibilitySender} on ${historyVisibilitySetDatePiece}.`,
]),
t.p([
maybeIndexedMessage,
'(',
' (',
t.a(
{
className: 'external-link RightPanelContentView_footerLink',

View File

@ -322,10 +322,21 @@ class RoomDirectoryView extends TemplateView {
t.view(roomList),
t.div({ className: 'RoomDirectoryView_paginationButtonCombo' }, [
t.a(
{ className: 'RoomDirectoryView_paginationButton', href: vm.prevPageUrl },
{
className: 'RoomDirectoryView_paginationButton',
href: vm.prevPageUrl,
'data-testid': 'room-directory-prev-link',
},
'Previous'
),
t.a({ className: 'RoomDirectoryView_paginationButton', href: vm.nextPageUrl }, 'Next'),
t.a(
{
className: 'RoomDirectoryView_paginationButton',
href: vm.nextPageUrl,
'data-testid': 'room-directory-next-link',
},
'Next'
),
]),
]),
t.if(

View File

@ -3,12 +3,7 @@
#
# Currently this is based on Complement Synapse images which are based on the
# published 'synapse:latest' image -- ie, the most recent Synapse release.
# FIXME: We're pinning the version to `v1.79.0` until
# https://github.com/matrix-org/synapse/issues/15526 is fixed. Feel free to update back
# to `latest` once that issue is resolved. More context:
# https://github.com/matrix-org/matrix-public-archive/pull/208#discussion_r1183294630
ARG SYNAPSE_VERSION=v1.79.0
ARG SYNAPSE_VERSION=latest
FROM matrixdotorg/synapse:${SYNAPSE_VERSION}

View File

@ -38,6 +38,7 @@ const {
sendMessage,
createMessagesInRoom,
getMessagesInRoom,
waitForResultsInHomeserverRoomDirectory,
updateProfile,
uploadContent,
} = require('./test-utils/client-utils');
@ -2507,15 +2508,33 @@ describe('matrix-public-archive', () => {
// test runs against the same homeserver
const timeToken = Date.now();
const roomPlanetPrefix = `planet-${timeToken}`;
const roomSaturnName = `${roomPlanetPrefix}-saturn`;
const roomSaturnId = await createTestRoom(client, {
name: `${roomPlanetPrefix}-saturn`,
name: roomSaturnName,
});
const roomMarsName = `${roomPlanetPrefix}-mars`;
const roomMarsId = await createTestRoom(client, {
name: `${roomPlanetPrefix}-mars`,
name: roomMarsName,
});
// Browse the room directory without search to see many rooms
//
// (we set this here in case we timeout while waiting for the test rooms to
// appear in the room directory)
archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl();
// Try to avoid flakey tests where the homeserver hasn't added the rooms to the
// room directory yet. This isn't completely robust as it doesn't check that the
// random room at the start is in the directory but should be good enough.
await waitForResultsInHomeserverRoomDirectory({
client,
searchTerm: roomSaturnName,
});
await waitForResultsInHomeserverRoomDirectory({
client,
searchTerm: roomMarsName,
});
const { data: roomDirectoryPageHtml } = await fetchEndpointAsText(archiveUrl);
const dom = parseHTML(roomDirectoryPageHtml);
@ -2556,17 +2575,33 @@ describe('matrix-public-archive', () => {
// test runs against the same homeserver
const timeToken = Date.now();
const roomPlanetPrefix = `remote-planet-${timeToken}`;
const roomXName = `${roomPlanetPrefix}-x`;
const roomXId = await createTestRoom(hs2Client, {
name: `${roomPlanetPrefix}-x`,
name: roomXName,
});
const roomYname = `${roomPlanetPrefix}-y`;
const roomYId = await createTestRoom(hs2Client, {
name: `${roomPlanetPrefix}-y`,
name: roomYname,
});
// (we set this here in case we timeout while waiting for the test rooms to
// appear in the room directory)
archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl({
homeserver: HOMESERVER_URL_TO_PRETTY_NAME_MAP[testMatrixServerUrl2],
searchTerm: roomPlanetPrefix,
});
// Try to avoid flakey tests where the homeserver hasn't added the rooms to the
// room directory yet.
await waitForResultsInHomeserverRoomDirectory({
client: hs2Client,
searchTerm: roomXName,
});
await waitForResultsInHomeserverRoomDirectory({
client: hs2Client,
searchTerm: roomYname,
});
const { data: roomDirectoryWithSearchPageHtml } = await fetchEndpointAsText(archiveUrl);
const domWithSearch = parseHTML(roomDirectoryWithSearchPageHtml);
@ -2601,21 +2636,39 @@ describe('matrix-public-archive', () => {
// test runs against the same homeserver
const timeToken = Date.now();
const roomPlanetPrefix = `planet-${timeToken}`;
const roomUranusName = `${roomPlanetPrefix}-uranus-nsfw`;
const roomUranusId = await createTestRoom(client, {
// NSFW in title
name: `${roomPlanetPrefix}-uranus-nsfw`,
name: roomUranusName,
});
const roomMarsName = `${roomPlanetPrefix}-mars`;
const roomMarsId = await createTestRoom(client, {
name: `${roomPlanetPrefix}-mars`,
name: roomMarsName,
// NSFW in room topic/description
topic: 'Get your ass to mars (NSFW)',
});
// Browse the room directory searching the room directory for those NSFW rooms
// (narrowing down results).
//
// (we set this here in case we timeout while waiting for the test rooms to
// appear in the room directory)
archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl({
searchTerm: roomPlanetPrefix,
});
// Try to avoid flakey tests where the homeserver hasn't added the rooms to the
// room directory yet. This isn't completely robust as it doesn't check that the
// random room at the start is in the directory but should be good enough.
await waitForResultsInHomeserverRoomDirectory({
client,
searchTerm: roomUranusName,
});
await waitForResultsInHomeserverRoomDirectory({
client,
searchTerm: roomMarsName,
});
const { data: roomDirectoryWithSearchPageHtml } = await fetchEndpointAsText(archiveUrl);
const domWithSearch = parseHTML(roomDirectoryWithSearchPageHtml);
@ -2644,6 +2697,173 @@ describe('matrix-public-archive', () => {
);
});
});
it('pagination is seamless', async () => {
const client = await getTestClientForHs(testMatrixServerUrl1);
// We use a `timeToken` so that we can namespace these rooms away from other
// test runs against the same homeserver
const timeToken = Date.now();
const roomPlanetPrefix = `planet-${timeToken}`;
// Fill up the room directory with multiple pages of rooms
const visibleRoomConfigurations = [];
const roomsConfigurationsToCreate = [];
for (let i = 0; i < 40; i++) {
const roomCreateOptions = {
name: `${roomPlanetPrefix}-room-${i}`,
};
// Sprinkle in some rooms every so often that should not appear in the room directory
if (i % 3 === 0) {
roomCreateOptions.name = `${roomPlanetPrefix}-room-not-world-readable-${i}`;
roomCreateOptions.initial_state = [
{
type: 'm.room.history_visibility',
state_key: '',
content: {
history_visibility: 'joined',
},
},
{
type: 'm.room.topic',
state_key: '',
content: {
// Just a specific token we can search for in the DOM to make sure
// this room does not appear in the room directory.
topic: 'should-not-be-visible-in-archive-room-directory',
},
},
];
} else {
visibleRoomConfigurations.push(roomCreateOptions);
}
roomsConfigurationsToCreate.push(roomCreateOptions);
}
// Doing all of these create room requests in parallel is about 2x faster than
// doing them serially and the room directory doesn't return the rooms in any
// particular order so it doesn't make the test any more clear doing them
// serially anyway.
const createdRoomsIds = await Promise.all(
roomsConfigurationsToCreate.map((roomCreateOptions) =>
createTestRoom(client, roomCreateOptions)
)
);
/**
 * Look up the name a test room was created with, given its room ID.
 *
 * `createdRoomsIds` and `roomsConfigurationsToCreate` are parallel lists, so the position
 * of the ID in the former is the position of its configuration in the latter.
 *
 * @param {string} expectedRoomId - A room ID expected to be in `createdRoomsIds`
 * @returns {string} The `name` from the matching room configuration
 * @throws {AssertionError} When the room ID or its configuration can't be found
 */
function roomIdToRoomName(expectedRoomId) {
  const roomIndex = createdRoomsIds.indexOf(expectedRoomId);
  // "Not found" is `-1`; index `0` is a valid position for the first created room
  assert(
    roomIndex >= 0,
    `Expected to find expectedRoomId=${expectedRoomId} in the list of created rooms createdRoomsIds=${createdRoomsIds}`
  );

  const roomConfig = roomsConfigurationsToCreate[roomIndex];
  assert(
    roomConfig,
    `Expected to find room config for roomIndex=${roomIndex} in the list of roomsConfigurationsToCreate (length ${roomsConfigurationsToCreate.length})}`
  );

  return roomConfig.name;
}
/**
 * Fetch a room directory page from the archive, make sure none of the room cards belong
 * to one of the rooms that should be hidden, and collect what is needed to keep
 * paginating.
 *
 * @param {string} archiveUrl - URL of the room directory page to fetch
 * @returns {Promise<{archiveUrl: string, roomsIdsOnPage: string[], previousPaginationLink: string, nextPaginationLink: string}>}
 *   The URL that was fetched, the `data-room-id` of every room card on the page (in
 *   page order) and the `href` of the previous/next pagination links
 * @throws {AssertionError} When a room card contains the marker topic of a room that
 *   should not be visible
 */
async function checkRoomsOnPage(archiveUrl) {
  const { data: roomDirectoryWithSearchPageHtml } = await fetchEndpointAsText(archiveUrl);
  const dom = parseHTML(roomDirectoryWithSearchPageHtml);

  const roomsCardsOnPageWithSearch = [
    ...dom.document.querySelectorAll(`[data-testid="room-card"]`),
  ];

  const roomsIdsOnPage = roomsCardsOnPageWithSearch.map((roomCardEl) => {
    return roomCardEl.getAttribute('data-room-id');
  });

  // Sanity check that we don't see any non-world_readable rooms.
  //
  // We use `doesNotMatch` instead of a negative-lookahead "match everything but" regex
  // because `.` does not match line breaks, which made the old check fail on any card
  // markup spanning multiple lines even when the marker was absent.
  for (const roomCardEl of roomsCardsOnPageWithSearch) {
    assert.doesNotMatch(
      roomCardEl.innerHTML,
      /should-not-be-visible-in-archive-room-directory/,
      `Expected not to see any non-world_readable rooms on the page but saw ${roomCardEl.getAttribute(
        'data-room-id'
      )} which has "should-not-be-visible-in-archive-room-directory" in the room topic`
    );
  }

  // Find the pagination buttons and grab the links to the previous and next pages
  const previousLinkElement = dom.document.querySelector(
    `[data-testid="room-directory-prev-link"]`
  );
  const nextLinkElement = dom.document.querySelector(
    `[data-testid="room-directory-next-link"]`
  );
  const previousPaginationLink = previousLinkElement.getAttribute('href');
  const nextPaginationLink = nextLinkElement.getAttribute('href');

  return {
    archiveUrl,
    roomsIdsOnPage,
    previousPaginationLink,
    nextPaginationLink,
  };
}
// Scope the room directory to the rooms created for this test by browsing
// with the search prefix.
//
// (assigned before waiting so the URL is already set if we timeout while the
// test rooms are still showing up in the room directory)
archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl({
  searchTerm: roomPlanetPrefix,
});

// Reduce flakiness from the homeserver not having added the rooms to the room
// directory yet. Only the first and last visible rooms are checked, so this is
// not a guarantee that every room is listed, but it's better than nothing.
for (const roomConfigIndex of [0, visibleRoomConfigurations.length - 1]) {
  await waitForResultsInHomeserverRoomDirectory({
    client,
    searchTerm: visibleRoomConfigurations[roomConfigIndex].name,
  });
}

// Walk the directory with the pagination links: 1 -> 2 -> 3 -> 2 -> 1
const firstPage = await checkRoomsOnPage(archiveUrl);
const secondPage = await checkRoomsOnPage(firstPage.nextPaginationLink);
const thirdPage = await checkRoomsOnPage(secondPage.nextPaginationLink);
const backtrackSecondPage = await checkRoomsOnPage(thirdPage.previousPaginationLink);
const backtrackFirstPage = await checkRoomsOnPage(backtrackSecondPage.previousPaginationLink);

// Translate the room IDs found on a page into the names of the rooms we created
const toRoomNames = (roomIds) => roomIds.map(roomIdToRoomName);

// Every visible room should have shown up on one of the three pages
const roomNamesSeenGoingForward = toRoomNames(
  firstPage.roomsIdsOnPage.concat(secondPage.roomsIdsOnPage, thirdPage.roomsIdsOnPage)
).sort();
const expectedVisibleRoomNames = visibleRoomConfigurations
  .map((roomConfig) => roomConfig.name)
  .sort();
assert.deepStrictEqual(
  roomNamesSeenGoingForward,
  expectedVisibleRoomNames,
  'Make sure we saw all visible rooms paginating through the directory'
);

// Going backward should show the same rooms, in the same order, as going forward
archiveUrl = backtrackSecondPage.archiveUrl;
assert.deepStrictEqual(
  toRoomNames(backtrackSecondPage.roomsIdsOnPage),
  toRoomNames(secondPage.roomsIdsOnPage),
  'From the third page, going backward to second page should show the same rooms that we saw on the second page when going forward'
);

archiveUrl = backtrackFirstPage.archiveUrl;
assert.deepStrictEqual(
  toRoomNames(backtrackFirstPage.roomsIdsOnPage),
  toRoomNames(firstPage.roomsIdsOnPage),
  'From the second page, going backward to first page should show the same rooms that we saw on first page when going forward'
);
});
});
describe('access controls', () => {
@ -2688,15 +2908,50 @@ describe('matrix-public-archive', () => {
assert.strictEqual(dom.document.querySelector(`meta[name="robots"]`), null);
});
it('search engines not allowed to index `public` room', async () => {
// A room whose history visibility is anything other than `world_readable`
// must not be viewable through the archive; the request should be rejected
// with a 403.
it('search engines not allowed to access public room with non-`world_readable` history visibility', async () => {
  const client = await getTestClientForHs(testMatrixServerUrl1);
  const roomId = await createTestRoom(client, {
    // Set as `shared` since it's the next most permissive history visibility
    // after `world_readable` but still not allowed to be accessible in the
    // archive.
    initial_state: [
      {
        type: 'm.room.history_visibility',
        state_key: '',
        content: {
          history_visibility: 'shared',
        },
      },
    ],
  });

  try {
    archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId);
    await fetchEndpointAsText(archiveUrl);
    assert.fail(
      new TestError(
        'We expect the request to fail with a 403 since the archive should not be able to view a non-world_readable room but it succeeded'
      )
    );
  } catch (err) {
    // Our own "request unexpectedly succeeded" failure should surface as-is
    if (err instanceof TestError) {
      throw err;
    }

    // Read the status defensively so that an error without a `response`
    // (e.g. a network failure) fails this assertion with the original stack
    // in the message instead of throwing an unrelated `TypeError`.
    assert.strictEqual(
      err?.response?.status,
      403,
      `Expected err.response.status=${err?.response?.status} to be 403 but error was: ${err.stack}`
    );
  }
});
it('Configuring `stopSearchEngineIndexing` will stop search engine indexing', async () => {
// Disable search engine indexing across the entire instance
config.set('stopSearchEngineIndexing', true);
const client = await getTestClientForHs(testMatrixServerUrl1);
const roomId = await createTestRoom(client);
archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId);
const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl);

View File

@ -13,6 +13,11 @@ describe('checkTextForNsfw', () => {
NSFW_foo: true,
'NSFW-foo': true,
'NSFW:foo': true,
'18+ only': true,
// Previous false positives that we ran into in the wild that should not be flagged
// as NSFW
'1888-great-blizzard': false,
'argon-18-element': false,
}).forEach(([inputText, expectedNsfw]) => {
it(`should return ${expectedNsfw} for '${inputText}'`, () => {
assert.strictEqual(

View File

@ -4,6 +4,8 @@ const assert = require('assert');
const urlJoin = require('url-join');
const { fetchEndpointAsJson, fetchEndpoint } = require('../../server/lib/fetch-endpoint');
const getServerNameFromMatrixRoomIdOrAlias = require('../../server/lib/matrix-utils/get-server-name-from-matrix-room-id-or-alias');
const { MS_LOOKUP } = require('matrix-public-archive-shared/lib/reference-values');
const { ONE_SECOND_IN_MS } = MS_LOOKUP;
const config = require('../../server/lib/config');
const matrixAccessToken = config.get('matrixAccessToken');
@ -14,7 +16,7 @@ assert(testMatrixServerUrl1);
// Number of transaction IDs handed out so far by this module. Included in each
// ID so that two IDs generated within the same millisecond still differ.
let txnCount = 0;

/**
 * Generate a new transaction ID.
 *
 * @returns {string} An ID of the form `txn<count>-<epoch ms>` where `<count>`
 *   increments on every call.
 */
function getTxnId() {
  txnCount += 1;
  return `txn${txnCount}-${Date.now()}`;
}
// Basic slugify function, plenty of edge cases and should not be used for
@ -150,7 +152,7 @@ async function createTestRoom(client, overrideCreateOptions = {}) {
}
const roomName = overrideCreateOptions.name || 'the hangout spot';
const roomAlias = slugify(roomName + getTxnId());
const roomAlias = slugify(roomName + '-' + getTxnId());
const { data: createRoomResponse } = await fetchEndpointAsJson(
urlJoin(client.homeserverUrl, `/_matrix/client/v3/createRoom?${qs.toString()}`),
@ -421,6 +423,50 @@ async function uploadContent({ client, roomId, data, fileName, contentType }) {
return mxcUri;
}
// This can be removed after https://github.com/matrix-org/synapse/issues/15526 is solved
/**
 * Poll the homeserver's public room directory until a search for `searchTerm`
 * returns at least one result.
 *
 * @param {object} options
 * @param {object} options.client - Test client; `homeserverUrl` and
 *   `accessToken` are read from it
 * @param {string} options.searchTerm - Sent as `filter.generic_search_term`
 * @param {number} [options.timeoutMs] - How long to keep polling before giving
 *   up (defaults to 10 seconds)
 * @returns {Promise<void>} Resolves once the directory returns a result
 * @throws {Error} When no results show up within `timeoutMs`, or when a
 *   request to the homeserver fails
 */
async function waitForResultsInHomeserverRoomDirectory({
  client,
  searchTerm,
  timeoutMs = 10 * ONE_SECOND_IN_MS,
}) {
  assert(client);
  assert(searchTerm !== undefined);

  const roomDirectoryEndpoint = urlJoin(client.homeserverUrl, `_matrix/client/v3/publicRooms`);

  // Flipped when the timeout fires so the polling loop stops issuing requests
  // instead of hammering the homeserver after we have already given up.
  let timedOut = false;
  let timeoutId;
  const timeoutPromise = new Promise((resolve, reject) => {
    timeoutId = setTimeout(() => {
      timedOut = true;
      reject(new Error('Timed out waiting for rooms to appear in the room directory'));
    }, timeoutMs);
  });

  const pollUntilResultsFound = async () => {
    while (!timedOut) {
      const { data: publicRoomsRes } = await fetchEndpointAsJson(roomDirectoryEndpoint, {
        method: 'POST',
        body: {
          include_all_networks: true,
          filter: {
            generic_search_term: searchTerm,
          },
          limit: 1,
        },
        accessToken: client.accessToken,
      });

      if (publicRoomsRes.chunk.length > 0) {
        return;
      }
    }
  };

  try {
    await Promise.race([pollUntilResultsFound(), timeoutPromise]);
  } finally {
    // Always clear the timer so it doesn't stay pending after we're done
    clearTimeout(timeoutId);
  }
}
module.exports = {
ensureUserRegistered,
getTestClientForAs,
@ -433,6 +479,7 @@ module.exports = {
sendMessage,
createMessagesInRoom,
getMessagesInRoom,
waitForResultsInHomeserverRoomDirectory,
updateProfile,
uploadContent,
};