From 32c77ecffe47ff78ea7c77803cc9e12bcd8a7607 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 8 Sep 2022 19:15:07 -0500 Subject: [PATCH] Only show `world_readable` or `public` rooms in the archive. Only index `world_readable` (#66) Only show `world_readable` or `public` rooms in the archive. Only allow `world_readable` rooms to be indexed by search engines. Related to https://github.com/matrix-org/matrix-public-archive/issues/47 --- ...-hydrogen-vm-render-script-to-page-html.js | 7 +++ server/lib/matrix-utils/ensure-room-joined.js | 17 +++--- server/lib/matrix-utils/fetch-room-data.js | 33 ++++++++++- server/lib/status-error.js | 2 +- server/routes/room-routes.js | 14 +++++ test/client-utils.js | 3 +- test/e2e-tests.js | 56 +++++++++++++++++++ 7 files changed, 121 insertions(+), 11 deletions(-) diff --git a/server/hydrogen-render/render-hydrogen-vm-render-script-to-page-html.js b/server/hydrogen-render/render-hydrogen-vm-render-script-to-page-html.js index ef28a87..91a6b63 100644 --- a/server/hydrogen-render/render-hydrogen-vm-render-script-to-page-html.js +++ b/server/hydrogen-render/render-hydrogen-vm-render-script-to-page-html.js @@ -27,11 +27,18 @@ async function renderHydrogenVmRenderScriptToPageHtml( const serializableSpans = getSerializableSpans(); const serializedSpans = JSON.stringify(serializableSpans); + // We shouldn't let some pages be indexed by search engines + let maybeNoIndexHtml = ''; + if (pageOptions.noIndex) { + maybeNoIndexHtml = ``; + } + const pageHtml = ` + ${maybeNoIndexHtml} ${sanitizeHtml(`${pageOptions.title}`)} ${pageOptions.styles .map((styleUrl) => ``) diff --git a/server/lib/matrix-utils/ensure-room-joined.js b/server/lib/matrix-utils/ensure-room-joined.js index 82440fd..78d8df0 100644 --- a/server/lib/matrix-utils/ensure-room-joined.js +++ b/server/lib/matrix-utils/ensure-room-joined.js @@ -6,6 +6,7 @@ const urlJoin = require('url-join'); const { fetchEndpointAsJson } = require('../fetch-endpoint'); const config = 
require('../config'); +const StatusError = require('../status-error'); const matrixServerUrl = config.get('matrixServerUrl'); assert(matrixServerUrl); @@ -15,18 +16,18 @@ async function ensureRoomJoined(accessToken, roomId, viaServers = []) { qs.append('server_name', viaServer); }); - // TODO: Only join world_readable rooms. Perhaps we want to serve public rooms - // where we have been invited. GET - // /_matrix/client/v3/directory/list/room/{roomId} (Gets the visibility of a - // given room on the server’s public room directory.) const joinEndpoint = urlJoin( matrixServerUrl, `_matrix/client/r0/join/${roomId}?${qs.toString()}` ); - await fetchEndpointAsJson(joinEndpoint, { - method: 'POST', - accessToken, - }); + try { + await fetchEndpointAsJson(joinEndpoint, { + method: 'POST', + accessToken, + }); + } catch (err) { + throw new StatusError(403, `Archiver is unable to join room: ${err.message}`); + } } module.exports = ensureRoomJoined; diff --git a/server/lib/matrix-utils/fetch-room-data.js b/server/lib/matrix-utils/fetch-room-data.js index 05bfdb3..5fb9435 100644 --- a/server/lib/matrix-utils/fetch-room-data.js +++ b/server/lib/matrix-utils/fetch-room-data.js @@ -22,14 +22,33 @@ async function fetchRoomData(accessToken, roomId) { matrixServerUrl, `_matrix/client/r0/rooms/${roomId}/state/m.room.avatar` ); + const stateHistoryVisibilityEndpoint = urlJoin( + matrixServerUrl, + `_matrix/client/r0/rooms/${roomId}/state/m.room.history_visibility` + ); + const stateJoinRulesEndpoint = urlJoin( + matrixServerUrl, + `_matrix/client/r0/rooms/${roomId}/state/m.room.join_rules` + ); - const [stateNameResDataOutcome, stateAvatarResDataOutcome] = await Promise.allSettled([ + const [ + stateNameResDataOutcome, + stateAvatarResDataOutcome, + stateHistoryVisibilityResDataOutcome, + stateJoinRulesResDataOutcome, + ] = await Promise.allSettled([ fetchEndpointAsJson(stateNameEndpoint, { accessToken, }), fetchEndpointAsJson(stateAvatarEndpoint, { accessToken, }), + 
fetchEndpointAsJson(stateHistoryVisibilityEndpoint, { + accessToken, + }), + fetchEndpointAsJson(stateJoinRulesEndpoint, { + accessToken, + }), ]); let name; @@ -42,10 +61,22 @@ async function fetchRoomData(accessToken, roomId) { avatarUrl = stateAvatarResDataOutcome.value.url; } + let historyVisibility; + if (stateHistoryVisibilityResDataOutcome.reason === undefined) { + historyVisibility = stateHistoryVisibilityResDataOutcome.value.history_visibility; + } + + let joinRule; + if (stateJoinRulesResDataOutcome.reason === undefined) { + joinRule = stateJoinRulesResDataOutcome.value.join_rule; + } + return { id: roomId, name, avatarUrl, + historyVisibility, + joinRule, }; } diff --git a/server/lib/status-error.js b/server/lib/status-error.js index bac4b11..3d34e39 100644 --- a/server/lib/status-error.js +++ b/server/lib/status-error.js @@ -9,7 +9,7 @@ function StatusError(status, inputMessage) { message = http.STATUS_CODES[status] || http.STATUS_CODES['500']; } - this.message = message; + this.message = `${status} - ${message}`; this.status = status; this.name = 'StatusError'; Error.captureStackTrace(this, StatusError); diff --git a/server/routes/room-routes.js b/server/routes/room-routes.js index ad27056..094d452 100644 --- a/server/routes/room-routes.js +++ b/server/routes/room-routes.js @@ -176,6 +176,18 @@ router.get( ), ]); + // Only `world_readable` or `shared` rooms that are `public` are viewable in the archive + const allowedToViewRoom = + roomData?.historyVisibility === 'world_readable' || + (roomData?.historyVisibility === 'shared' && roomData?.joinRule === 'public'); + + if (!allowedToViewRoom) { + throw new StatusError( + 403, + `Only \`world_readable\` or \`shared\` rooms that are \`public\` can be viewed in the archive. 
${roomData.id} has m.room.history_visiblity=${roomData?.historyVisibility} m.room.join_rules=${roomData?.joinRule}` + ); + } + if (events.length >= archiveMessageLimit) { throw new Error('TODO: Redirect user to smaller hour range'); } @@ -200,6 +212,8 @@ router.get( title: `${roomData.name} - Matrix Public Archive`, styles: [hydrogenStylesUrl, stylesUrl], scripts: [jsBundleUrl], + // We only allow search engines to index `world_readable` rooms + noIndex: roomData?.historyVisibility !== `world_readable`, } ); diff --git a/test/client-utils.js b/test/client-utils.js index edb42cb..b13e7b3 100644 --- a/test/client-utils.js +++ b/test/client-utils.js @@ -73,7 +73,7 @@ async function getTestClientForHs(testMatrixServerUrl) { } // Create a public room to test in -async function createTestRoom(client) { +async function createTestRoom(client, overrideCreateOptions) { let qs = new URLSearchParams(); if (client.applicationServiceUserIdOverride) { qs.append('user_id', client.applicationServiceUserIdOverride); @@ -95,6 +95,7 @@ async function createTestRoom(client) { }, }, ], + ...overrideCreateOptions, }, accessToken: client.accessToken, } diff --git a/test/e2e-tests.js b/test/e2e-tests.js index f824bd9..0522af1 100644 --- a/test/e2e-tests.js +++ b/test/e2e-tests.js @@ -506,5 +506,61 @@ describe('matrix-public-archive', () => { it( `will render a room with a sparse amount of messages (a few per day) with no contamination between days` ); + + describe('access controls', () => { + it('not allowed to view private room even when the archiver user is in the room', async () => { + const client = await getTestClientForHs(testMatrixServerUrl1); + const roomId = await createTestRoom(client, { + preset: 'private_chat', + initial_state: [], + }); + + try { + archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId); + await fetchEndpointAsText(archiveUrl); + assert.fail( + 'We expect the request to fail with a 403 since the archive should not be able to view a private room' + 
); + } catch (err) { + assert.strictEqual(err.response.status, 403); + } + }); + + it('search engines allowed to index `world_readable` room', async () => { + const client = await getTestClientForHs(testMatrixServerUrl1); + const roomId = await createTestRoom(client); + + archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId); + const archivePageHtml = await fetchEndpointAsText(archiveUrl); + + const dom = parseHTML(archivePageHtml); + + // Make sure the `<meta name="robots" content="noindex, nofollow">` tag does NOT exist on the + // page telling search engines not to index it + assert.strictEqual(dom.document.querySelector(`meta[name="robots"]`), null); + }); + + it('search engines not allowed to index `public` room', async () => { + const client = await getTestClientForHs(testMatrixServerUrl1); + const roomId = await createTestRoom(client, { + // The default options for the test rooms add a + // `m.room.history_visibility` state event so we override that here so + // it's only a public room. + initial_state: [], + }); + + archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId); + const archivePageHtml = await fetchEndpointAsText(archiveUrl); + + const dom = parseHTML(archivePageHtml); + + // Make sure the `<meta name="robots" content="noindex, nofollow">` tag exists on the page + // telling search engines not to index it + assert.strictEqual( + dom.document.querySelector(`meta[name="robots"]`)?.getAttribute('content'), + 'noindex, nofollow' + ); + }); + }); }); });