Only show `world_readable` or `public` rooms in the archive. Only index `world_readable` (#66)

Only show `world_readable` or `public` rooms in the archive. Only allow `world_readable` rooms to be indexed by search engines.

Related to https://github.com/matrix-org/matrix-public-archive/issues/47
This commit is contained in:
Eric Eastwood 2022-09-08 19:15:07 -05:00 committed by GitHub
parent 65a371910a
commit 32c77ecffe
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 121 additions and 11 deletions

View File

@ -27,11 +27,18 @@ async function renderHydrogenVmRenderScriptToPageHtml(
const serializableSpans = getSerializableSpans();
const serializedSpans = JSON.stringify(serializableSpans);
// We shouldn't let some pages be indexed by search engines
let maybeNoIndexHtml = '';
if (pageOptions.noIndex) {
maybeNoIndexHtml = `<meta name="robots" content="noindex, nofollow" />`;
}
const pageHtml = `
<!doctype html>
<html lang="en">
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
${maybeNoIndexHtml}
${sanitizeHtml(`<title>${pageOptions.title}</title>`)}
${pageOptions.styles
.map((styleUrl) => `<link href="${styleUrl}" rel="stylesheet">`)

View File

@ -6,6 +6,7 @@ const urlJoin = require('url-join');
const { fetchEndpointAsJson } = require('../fetch-endpoint');
const config = require('../config');
const StatusError = require('../status-error');
const matrixServerUrl = config.get('matrixServerUrl');
assert(matrixServerUrl);
@ -15,18 +16,18 @@ async function ensureRoomJoined(accessToken, roomId, viaServers = []) {
qs.append('server_name', viaServer);
});
// TODO: Only join world_readable rooms. Perhaps we want to serve public rooms
// where we have been invited. GET
// /_matrix/client/v3/directory/list/room/{roomId} (Gets the visibility of a
// given room on the server's public room directory.)
const joinEndpoint = urlJoin(
matrixServerUrl,
`_matrix/client/r0/join/${roomId}?${qs.toString()}`
);
await fetchEndpointAsJson(joinEndpoint, {
method: 'POST',
accessToken,
});
try {
await fetchEndpointAsJson(joinEndpoint, {
method: 'POST',
accessToken,
});
} catch (err) {
throw new StatusError(403, `Archiver is unable to join room: ${err.message}`);
}
}
module.exports = ensureRoomJoined;

View File

@ -22,14 +22,33 @@ async function fetchRoomData(accessToken, roomId) {
matrixServerUrl,
`_matrix/client/r0/rooms/${roomId}/state/m.room.avatar`
);
const stateHistoryVisibilityEndpoint = urlJoin(
matrixServerUrl,
`_matrix/client/r0/rooms/${roomId}/state/m.room.history_visibility`
);
const stateJoinRulesEndpoint = urlJoin(
matrixServerUrl,
`_matrix/client/r0/rooms/${roomId}/state/m.room.join_rules`
);
const [stateNameResDataOutcome, stateAvatarResDataOutcome] = await Promise.allSettled([
const [
stateNameResDataOutcome,
stateAvatarResDataOutcome,
stateHistoryVisibilityResDataOutcome,
stateJoinRulesResDataOutcome,
] = await Promise.allSettled([
fetchEndpointAsJson(stateNameEndpoint, {
accessToken,
}),
fetchEndpointAsJson(stateAvatarEndpoint, {
accessToken,
}),
fetchEndpointAsJson(stateHistoryVisibilityEndpoint, {
accessToken,
}),
fetchEndpointAsJson(stateJoinRulesEndpoint, {
accessToken,
}),
]);
let name;
@ -42,10 +61,22 @@ async function fetchRoomData(accessToken, roomId) {
avatarUrl = stateAvatarResDataOutcome.value.url;
}
let historyVisibility;
if (stateHistoryVisibilityResDataOutcome.reason === undefined) {
historyVisibility = stateHistoryVisibilityResDataOutcome.value.history_visibility;
}
let joinRule;
if (stateJoinRulesResDataOutcome.reason === undefined) {
joinRule = stateJoinRulesResDataOutcome.value.join_rule;
}
return {
id: roomId,
name,
avatarUrl,
historyVisibility,
joinRule,
};
}

View File

@ -9,7 +9,7 @@ function StatusError(status, inputMessage) {
message = http.STATUS_CODES[status] || http.STATUS_CODES['500'];
}
this.message = message;
this.message = `${status} - ${message}`;
this.status = status;
this.name = 'StatusError';
Error.captureStackTrace(this, StatusError);

View File

@ -176,6 +176,18 @@ router.get(
),
]);
// Only `world_readable` or `shared` rooms that are `public` are viewable in the archive
const allowedToViewRoom =
roomData?.historyVisibility === 'world_readable' ||
(roomData?.historyVisibility === 'shared' && roomData?.joinRule === 'public');
if (!allowedToViewRoom) {
throw new StatusError(
403,
`Only \`world_readable\` or \`shared\` rooms that are \`public\` can be viewed in the archive. ${roomData.id} has m.room.history_visiblity=${roomData?.historyVisibility} m.room.join_rules=${roomData?.joinRule}`
);
}
if (events.length >= archiveMessageLimit) {
throw new Error('TODO: Redirect user to smaller hour range');
}
@ -200,6 +212,8 @@ router.get(
title: `${roomData.name} - Matrix Public Archive`,
styles: [hydrogenStylesUrl, stylesUrl],
scripts: [jsBundleUrl],
// We only allow search engines to index `world_readable` rooms
noIndex: roomData?.historyVisibility !== `world_readable`,
}
);

View File

@ -73,7 +73,7 @@ async function getTestClientForHs(testMatrixServerUrl) {
}
// Create a public room to test in
async function createTestRoom(client) {
async function createTestRoom(client, overrideCreateOptions) {
let qs = new URLSearchParams();
if (client.applicationServiceUserIdOverride) {
qs.append('user_id', client.applicationServiceUserIdOverride);
@ -95,6 +95,7 @@ async function createTestRoom(client) {
},
},
],
...overrideCreateOptions,
},
accessToken: client.accessToken,
}

View File

@ -506,5 +506,61 @@ describe('matrix-public-archive', () => {
it(
`will render a room with a sparse amount of messages (a few per day) with no contamination between days`
);
describe('access controls', () => {
  // A room created with the `private_chat` preset and no extra initial state
  // must be rejected by the archive with a 403.
  it('not allowed to view private room even when the archiver user is in the room', async () => {
    const client = await getTestClientForHs(testMatrixServerUrl1);
    const roomId = await createTestRoom(client, {
      preset: 'private_chat',
      initial_state: [],
    });

    archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId);

    // Capture the rejection instead of calling `assert.fail` inside the `try`:
    // the `catch` would intercept that assertion error and then blow up on
    // `err.response` being undefined, hiding the real reason for the failure.
    let requestError;
    try {
      await fetchEndpointAsText(archiveUrl);
    } catch (err) {
      requestError = err;
    }

    if (requestError === undefined) {
      assert.fail(
        'We expect the request to fail with a 403 since the archive should not be able to view a private room'
      );
    }

    // An error without an HTTP response is not the rejection under test, so
    // surface it unchanged rather than masking it with a TypeError.
    if (!requestError?.response) {
      throw requestError;
    }

    assert.strictEqual(requestError.response.status, 403);
  });

  it('search engines allowed to index `world_readable` room', async () => {
    const client = await getTestClientForHs(testMatrixServerUrl1);
    const roomId = await createTestRoom(client);

    archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId);
    const archivePageHtml = await fetchEndpointAsText(archiveUrl);

    const dom = parseHTML(archivePageHtml);

    // Make sure the `<meta name="robots" ...>` tag does NOT exist on the
    // page telling search engines not to index it
    assert.strictEqual(dom.document.querySelector(`meta[name="robots"]`), null);
  });

  it('search engines not allowed to index `public` room', async () => {
    const client = await getTestClientForHs(testMatrixServerUrl1);
    const roomId = await createTestRoom(client, {
      // The default options for the test rooms adds a
      // `m.room.history_visibility` state event so we override that here so
      // it's only a public room.
      initial_state: [],
    });

    archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId);
    const archivePageHtml = await fetchEndpointAsText(archiveUrl);

    const dom = parseHTML(archivePageHtml);

    // Make sure the `<meta name="robots" ...>` tag exists on the page
    // telling search engines not to index it
    assert.strictEqual(
      dom.document.querySelector(`meta[name="robots"]`)?.getAttribute('content'),
      'noindex, nofollow'
    );
  });
});
});
});