From 1d3e930fbd57788ed60494a64835127ff97dbd53 Mon Sep 17 00:00:00 2001 From: Tulir Asokan Date: Wed, 28 Jun 2023 00:56:58 +0300 Subject: [PATCH 01/12] Don't allow previewing `shared` history rooms (#239) Only `world_readable` can be considered as opting into having history publicly on the web. Anything else must not be archived until there's a dedicated state event for opting into archiving. --- server/lib/matrix-utils/fetch-room-data.js | 12 ------ server/routes/room-directory-routes.js | 4 +- server/routes/room-routes.js | 12 +++--- test/e2e-tests.js | 45 +++++++++++++++++++--- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/server/lib/matrix-utils/fetch-room-data.js b/server/lib/matrix-utils/fetch-room-data.js index 91c702e..e1ea483 100644 --- a/server/lib/matrix-utils/fetch-room-data.js +++ b/server/lib/matrix-utils/fetch-room-data.js @@ -155,7 +155,6 @@ const fetchRoomData = traceFunction(async function ( stateCanonicalAliasResDataOutcome, stateAvatarResDataOutcome, stateHistoryVisibilityResDataOutcome, - stateJoinRulesResDataOutcome, predecessorInfoOutcome, successorInfoOutcome, ] = await Promise.allSettled([ @@ -182,10 +181,6 @@ const fetchRoomData = traceFunction(async function ( abortSignal, } ), - fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.join_rules'), { - accessToken: matrixAccessToken, - abortSignal, - }), fetchPredecessorInfo(matrixAccessToken, roomId, { abortSignal }), fetchSuccessorInfo(matrixAccessToken, roomId, { abortSignal }), ]); @@ -220,12 +215,6 @@ const fetchRoomData = traceFunction(async function ( historyVisibility = data?.content?.history_visibility; } - let joinRule; - if (stateJoinRulesResDataOutcome.reason === undefined) { - const { data } = stateJoinRulesResDataOutcome.value; - joinRule = data?.content?.join_rule; - } - let roomCreationTs; let predecessorRoomId; let predecessorLastKnownEventId; @@ -251,7 +240,6 @@ const fetchRoomData = traceFunction(async function ( canonicalAlias, avatarUrl, historyVisibility, - joinRule, roomCreationTs, predecessorRoomId, predecessorLastKnownEventId, diff --git a/server/routes/room-directory-routes.js b/server/routes/room-directory-routes.js index 7b88671..ad87681 100644 --- a/server/routes/room-directory-routes.js +++ b/server/routes/room-directory-routes.js @@ -22,7 +22,6 @@ const matrixServerName = config.get('matrixServerName'); assert(matrixServerName); const matrixAccessToken = config.get('matrixAccessToken'); assert(matrixAccessToken); -const stopSearchEngineIndexing = config.get('stopSearchEngineIndexing'); const router = express.Router({ caseSensitive: true, @@ -71,7 +70,8 @@ router.get( } // We index the room directory unless the config says we shouldn't index anything - const shouldIndex = !stopSearchEngineIndexing; + const stopSearchEngineIndexingFromConfig = config.get('stopSearchEngineIndexing'); + const shouldIndex = !stopSearchEngineIndexingFromConfig; const pageOptions = { title: `Matrix Public Archive`, diff --git a/server/routes/room-routes.js b/server/routes/room-routes.js index c58dd7e..7417ac9 100644 --- a/server/routes/room-routes.js +++ b/server/routes/room-routes.js @@ -57,7 +57,6 @@ const matrixServerUrl = config.get('matrixServerUrl'); assert(matrixServerUrl); const matrixAccessToken = config.get('matrixAccessToken'); assert(matrixAccessToken); -const stopSearchEngineIndexing = config.get('stopSearchEngineIndexing'); const matrixPublicArchiveURLCreator = new MatrixPublicArchiveURLCreator(basePath); @@ -828,15 +827,13 @@ router.get( }), ]); - // Only `world_readable` or `shared` rooms that are `public` are viewable in the archive - const allowedToViewRoom = - roomData.historyVisibility === 'world_readable' || - (roomData.historyVisibility === 'shared' && roomData.joinRule === 'public'); + // Only `world_readable` rooms are viewable in the archive + const allowedToViewRoom = roomData.historyVisibility === 'world_readable'; if (!allowedToViewRoom) { throw new StatusError( 403, - `Only \`world_readable\` or \`shared\` rooms that are \`public\` can be viewed in the archive. ${roomData.id} has m.room.history_visiblity=${roomData.historyVisibility} m.room.join_rules=${roomData.joinRule}` + `Only \`world_readable\` rooms can be viewed in the archive. ${roomData.id} has m.room.history_visiblity=${roomData.historyVisibility}` ); } @@ -891,7 +888,8 @@ router.get( // Default to no indexing (safe default) let shouldIndex = false; - if (stopSearchEngineIndexing) { + const stopSearchEngineIndexingFromConfig = config.get('stopSearchEngineIndexing'); + if (stopSearchEngineIndexingFromConfig) { shouldIndex = false; } else { // Otherwise we only allow search engines to index `world_readable` rooms diff --git a/test/e2e-tests.js b/test/e2e-tests.js index f7b4bc2..b95425c 100644 --- a/test/e2e-tests.js +++ b/test/e2e-tests.js @@ -2688,15 +2688,50 @@ describe('matrix-public-archive', () => { assert.strictEqual(dom.document.querySelector(`meta[name="robots"]`), null); }); - it('search engines not allowed to index `public` room', async () => { + it('search engines not allowed to access public room with non-`world_readable` history visibility', async () => { const client = await getTestClientForHs(testMatrixServerUrl1); const roomId = await createTestRoom(client, { - // The default options for the test rooms adds a - // `m.room.history_visiblity` state event so we override that here so - // it's only a public room. - initial_state: [], + // Set as `shared` since it's the next most permissive history visibility + // after `world_readable` but still not allowed to be accesible in the + // archive. + initial_state: [ + { + type: 'm.room.history_visibility', + state_key: '', + content: { + history_visibility: 'shared', + }, + }, + ], }); + try { + archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId); + await fetchEndpointAsText(archiveUrl); + assert.fail( + new TestError( + 'We expect the request to fail with a 403 since the archive should not be able to view a non-world_readable room but it succeeded' + ) + ); + } catch (err) { + if (err instanceof TestError) { + throw err; + } + assert.strictEqual( + err.response.status, + 403, + `Expected err.response.status=${err?.response?.status} to be 403 but error was: ${err.stack}` + ); + } + }); + + it('Configuring `stopSearchEngineIndexing` will stop search engine indexing', async () => { + // Disable search engine indexing across the entire instance + config.set('stopSearchEngineIndexing', true); + + const client = await getTestClientForHs(testMatrixServerUrl1); + const roomId = await createTestRoom(client); + archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForRoom(roomId); const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl); From 2243db5fffe0d3b8267249f3c1b95ccb0e2a8572 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 27 Jun 2023 17:16:16 -0500 Subject: [PATCH 02/12] Fix eslint trying to look at `node_modules/` (#275) I'm not sure what exactly is causing the behavior change now besides that I am running on Linux. The quote fix around the path is from https://stackoverflow.com/questions/51021751/express-js-lint-gives-mistake `node_modules/` is already part of our `.eslintignore`. Previously: ```sh $ npm run lint > matrix-public-archive@0.1.0 lint > eslint **/*.js Oops! Something went wrong! :( ESLint: 8.37.0 You are linting "node_modules/ipaddr.js", but all of the files matching the glob pattern "node_modules/ipaddr.js" are ignored. If you don't want to lint these files, remove the pattern "node_modules/ipaddr.js" from the list of arguments passed to ESLint. If you do want to lint these files, try the following solutions: * Check your .eslintignore file, or the eslintIgnore property in package.json, to ensure that the files are not configured to be ignored. * Explicitly list the files from this glob that you'd like to lint on the command-line, rather than providing a glob as an argument. ``` --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 17e1bef..434c86a 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,7 @@ "url": "https://github.com/matrix-org/matrix-public-archive" }, "scripts": { - "lint": "eslint **/*.js", + "lint": "eslint \"**/*.js\"", "build": "node ./build-scripts/do-client-build.js", "start": "node server/server.js", "start-dev": "node server/start-dev.js", From 3df0f0080907dd3fb10350e8d1c0c018984206b1 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 27 Jun 2023 17:18:25 -0500 Subject: [PATCH 03/12] Prepare changelog with #239 and #275 --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e4078c..b96964c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ - Add `/faq` redirect, https://github.com/matrix-org/matrix-public-archive/pull/265 - Use `rel=canonical` link to de-duplicate event permalinks, https://github.com/matrix-org/matrix-public-archive/pull/266, https://github.com/matrix-org/matrix-public-archive/pull/269 - Prevent join event spam with stable `reason`, https://github.com/matrix-org/matrix-public-archive/pull/268 +- Don't allow previewing `shared` history rooms, https://github.com/matrix-org/matrix-public-archive/pull/239 + +Developer facing: + +- Fix eslint trying to look at `node_modules/`, https://github.com/matrix-org/matrix-public-archive/pull/275 # 0.1.0 - 2023-05-11 From ff18a46283eddea4e68ed079b108cd9b0c5a9122 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 28 Jun 2023 00:24:20 -0500 Subject: [PATCH 04/12] Attribute #239 author --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b96964c..1f0f926 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ - Use `rel=canonical` link to de-duplicate event permalinks, https://github.com/matrix-org/matrix-public-archive/pull/266, https://github.com/matrix-org/matrix-public-archive/pull/269 - Prevent join event spam with stable `reason`, https://github.com/matrix-org/matrix-public-archive/pull/268 - Don't allow previewing `shared` history rooms, https://github.com/matrix-org/matrix-public-archive/pull/239 + - Contributed by [@tulir](https://github.com/tulir) Developer facing: From 3b378675c3d10d6f96f9c4026fab4a034f7148e5 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 28 Jun 2023 18:14:31 -0500 Subject: [PATCH 05/12] Update FAQ to explain `world_readable` only (#277) Follow-up to https://github.com/matrix-org/matrix-public-archive/pull/239 --- docs/faq.md | 66 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 3c5fda0..6725558 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -19,54 +19,54 @@ messages from any given date and day-by-day navigation. ## Why did the archive bot join my room? -Only public Matrix rooms with `shared` or `world_readable` [history -visibility](https://spec.matrix.org/latest/client-server-api/#room-history-visibility) are -accessible in the Matrix Public Archive. In some clients like Element, the `shared` -option equates to "Members only (since the point in time of selecting this option)" and -`world_readable` to "Anyone" under the **room settings** -> **Security & Privacy** -> -**Who can read history?**. +Only Matrix rooms with `world_readable` [history +visibility](https://spec.matrix.org/latest/client-server-api/#room-history-visibility) +are accessible in the Matrix Public Archive and indexed by search engines. But the archive bot (`@archive:matrix.org`) will join any public room because it doesn't -know the history visibility without first joining. Any room without `world_readable` or -`shared` history visibility will lead a `403 Forbidden`. And if the public room is in -the room directory, it will be listed in the archive but will still lead to a `403 -Forbidden` in that case. +know the history visibility without first joining. Any room that doesn't have +`world_readable` history visibility will lead a `403 Forbidden`. The Matrix Public Archive doesn't hold onto any data (it's stateless) and requests the messages from the homeserver every time. The [archive.matrix.org](https://archive.matrix.org/) instance has some caching in place, 5 minutes for the current day, and 2 days for past content. -The Matrix Public Archive only allows rooms with `world_readable` history visibility to -be indexed by search engines. See the [opt -out](#how-do-i-opt-out-and-keep-my-room-from-being-indexed-by-search-engines) topic -below for more details. - -### Why does the archive user join rooms instead of browsing them as a guest? - -Guests require `m.room.guest_access` to access a room. Most public rooms do not allow -guests because even the `public_chat` preset when creating a room does not allow guest -access. Not being able to view most public rooms is the major blocker on being able to -use guest access. The idea is if I can view the messages from a Matrix client as a -random user, I should also be able to see the messages in the archive. - -Guest access is also a much different ask than read-only access since guests can also -send messages in the room which isn't always desirable. The archive bot is read-only and -does not send messages. +See the [opt out +section](#how-do-i-opt-out-and-keep-my-room-from-being-indexed-by-search-engines) below +for more details. ## How do I opt out and keep my room from being indexed by search engines? -Only public Matrix rooms with `shared` or `world_readable` history visibility are -accessible to view in the Matrix Public Archive. But only rooms with history visibility -set to `world_readable` are indexable by search engines. +Only Matrix rooms with `world_readable` [history +visibility](https://spec.matrix.org/latest/client-server-api/#room-history-visibility) +are accessible in the Matrix Public Archive and indexed by search engines. One easy way +to opt-out is to change your rooms history visibility to something else if you don't +intend for your room be world readable. -Also see https://github.com/matrix-org/matrix-public-archive/issues/47 to track better -opt out controls. +Dedicated opt-out controls are being tracked in +[#47](https://github.com/matrix-org/matrix-public-archive/issues/47). -As a workaround for [archive.matrix.org](https://archive.matrix.org/) today, you can ban -the `@archive:matrix.org` user if you don't want your room content to be shown in the +As a workaround for [archive.matrix.org](https://archive.matrix.org/), you can ban the +`@archive:matrix.org` user if you don't want your room content to be shown in the archive at all. +### Why does the archive user join rooms instead peeking in the room or using guests? + +Since the archive only displays rooms with `world_readable` history visibility, we could +peek into the rooms without joining. This is being explored in +[#272](https://github.com/matrix-org/matrix-public-archive/pull/272). But peeking +doesn't work when the server doesn't know about the room already (this is commonly +referred to as federated peeking) which is why we have to fallback to joining the room +in any case. We could solve the federated peeking problem and avoid the join with +[MSC3266 room summaries](https://github.com/matrix-org/matrix-spec-proposals/pull/3266) +to check whether the room is `world_readable` even over federation. + +Guests are completely separate concept and controlled by the `m.room.guest_access` state +event in the room. Guest access is also a much different ask than read-only access since +guests can also send messages in the room which isn't always desirable. The archive bot +is read-only and does not send messages. + ## Technical details The main readme has a [technical overview](../README.md#technical-overview) of the From a79342f83c465cc74d4aa394bf9fc9b6240621df Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 28 Jun 2023 18:15:18 -0500 Subject: [PATCH 06/12] Prepare changelog with #277 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f0f926..c92533e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Prevent join event spam with stable `reason`, https://github.com/matrix-org/matrix-public-archive/pull/268 - Don't allow previewing `shared` history rooms, https://github.com/matrix-org/matrix-public-archive/pull/239 - Contributed by [@tulir](https://github.com/tulir) +- Update FAQ to explain `world_readable` only, https://github.com/matrix-org/matrix-public-archive/pull/277 Developer facing: From 0fc4421432ffa1ba61cdad63fe407d9464a01e14 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 28 Jun 2023 20:29:49 -0500 Subject: [PATCH 07/12] Indicate when the room was set to `world_readable` and by who (#278) --- server/lib/matrix-utils/fetch-room-data.js | 7 +++++++ server/routes/room-routes.js | 5 ++++- shared/hydrogen-vm-render-script.js | 1 + shared/viewmodels/ArchiveRoomViewModel.js | 5 +++++ shared/views/RightPanelContentView.js | 24 ++++++++++++++++++++-- 5 files changed, 39 insertions(+), 3 deletions(-) diff --git a/server/lib/matrix-utils/fetch-room-data.js b/server/lib/matrix-utils/fetch-room-data.js index e1ea483..eb39c40 100644 --- a/server/lib/matrix-utils/fetch-room-data.js +++ b/server/lib/matrix-utils/fetch-room-data.js @@ -210,9 +210,15 @@ const fetchRoomData = traceFunction(async function ( } let historyVisibility; + let historyVisibilityEventMeta; if (stateHistoryVisibilityResDataOutcome.reason === undefined) { const { data } = stateHistoryVisibilityResDataOutcome.value; historyVisibility = data?.content?.history_visibility; + historyVisibilityEventMeta = { + historyVisibility, + sender: data?.sender, + originServerTs: data?.origin_server_ts, + }; } let roomCreationTs; @@ -240,6 +246,7 @@ const fetchRoomData = traceFunction(async function ( canonicalAlias, avatarUrl, historyVisibility, + historyVisibilityEventMeta, roomCreationTs, predecessorRoomId, predecessorLastKnownEventId, diff --git a/server/routes/room-routes.js b/server/routes/room-routes.js index 7417ac9..a168b8e 100644 --- a/server/routes/room-routes.js +++ b/server/routes/room-routes.js @@ -833,7 +833,10 @@ router.get( if (!allowedToViewRoom) { throw new StatusError( 403, - `Only \`world_readable\` rooms can be viewed in the archive. ${roomData.id} has m.room.history_visiblity=${roomData.historyVisibility}` + `Only \`world_readable\` rooms can be viewed in the archive. ` + + `${roomData.id} has m.room.history_visiblity=${roomData.historyVisibility} ` + + `(set by ${roomData.historyVisibilityEventMeta?.sender} on ` + + `${new Date(roomData.historyVisibilityEventMeta?.originServerTs).toISOString()})` ); } diff --git a/shared/hydrogen-vm-render-script.js b/shared/hydrogen-vm-render-script.js index 97a378c..4ef5cbe 100644 --- a/shared/hydrogen-vm-render-script.js +++ b/shared/hydrogen-vm-render-script.js @@ -118,6 +118,7 @@ async function mountHydrogen() { events, stateEventMap, shouldIndex, + historyVisibilityEventMeta: roomData.historyVisibilityEventMeta, basePath: config.basePath, }); diff --git a/shared/viewmodels/ArchiveRoomViewModel.js b/shared/viewmodels/ArchiveRoomViewModel.js index c38986f..3ae2f7e 100644 --- a/shared/viewmodels/ArchiveRoomViewModel.js +++ b/shared/viewmodels/ArchiveRoomViewModel.js @@ -75,6 +75,7 @@ class ArchiveRoomViewModel extends ViewModel { events, stateEventMap, shouldIndex, + historyVisibilityEventMeta, basePath, } = options; assert(homeserverUrl); @@ -85,6 +86,9 @@ class ArchiveRoomViewModel extends ViewModel { assert(events); assert(stateEventMap); assert(shouldIndex !== undefined); + assert(historyVisibilityEventMeta.historyVisibility); + assert(historyVisibilityEventMeta.sender); + assert(historyVisibilityEventMeta.originServerTs); assert(events); this._room = room; @@ -213,6 +217,7 @@ class ArchiveRoomViewModel extends ViewModel { shouldShowTimeSelector, timeSelectorViewModel: this._timeSelectorViewModel, shouldIndex, + historyVisibilityEventMeta, get developerOptionsUrl() { return urlRouter.urlForSegments([ navigation.segment('room', room.id), diff --git a/shared/views/RightPanelContentView.js b/shared/views/RightPanelContentView.js index fbb0bc5..3ebab71 100644 --- a/shared/views/RightPanelContentView.js +++ b/shared/views/RightPanelContentView.js @@ -10,12 +10,28 @@ class RightPanelContentView extends TemplateView { render(t, vm) { assert(vm.shouldIndex !== undefined); assert(vm.shouldShowTimeSelector !== undefined); + assert(vm.historyVisibilityEventMeta.historyVisibility); + assert(vm.historyVisibilityEventMeta.sender); + assert(vm.historyVisibilityEventMeta.originServerTs); let maybeIndexedMessage = 'This room is not being indexed by search engines '; if (vm.shouldIndex) { - maybeIndexedMessage = 'This room is being indexed by search engines '; + maybeIndexedMessage = 'This room is being indexed by search engines'; } + const historyVisibilitySender = vm.historyVisibilityEventMeta.sender; + + let historyVisibilityDisplayValue = vm.historyVisibilityEventMeta.historyVisibility; + if (vm.historyVisibilityEventMeta.historyVisibility === 'world_readable') { + historyVisibilityDisplayValue = 'world readable'; + } + + const [historyVisibilitySetDatePiece, _timePiece] = new Date( + vm.historyVisibilityEventMeta.originServerTs + ) + .toISOString() + .split('T'); + return t.div( { className: 'RightPanelContentView', @@ -33,9 +49,13 @@ class RightPanelContentView extends TemplateView { className: 'RightPanelContentView_footer', }, [ + t.p([ + `This room is accessible in the archive because it was set to ` + + `${historyVisibilityDisplayValue} by ${historyVisibilitySender} on ${historyVisibilitySetDatePiece}.`, + ]), t.p([ maybeIndexedMessage, - '(', + ' (', t.a( { className: 'external-link RightPanelContentView_footerLink', From 5de8cb4e3551dbba63f9c408ddbb165898efff5d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 28 Jun 2023 20:30:31 -0500 Subject: [PATCH 08/12] Prepare changelog with #278 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c92533e..76adc26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Don't allow previewing `shared` history rooms, https://github.com/matrix-org/matrix-public-archive/pull/239 - Contributed by [@tulir](https://github.com/tulir) - Update FAQ to explain `world_readable` only, https://github.com/matrix-org/matrix-public-archive/pull/277 +- Indicate when the room was set to `world_readable` and by who, https://github.com/matrix-org/matrix-public-archive/pull/278 Developer facing: From 59c9d3180e6a7e541003bec50073dc8f08880d75 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 29 Jun 2023 18:58:53 -0500 Subject: [PATCH 09/12] Fix `18+` false positives with NSFW check (#279) Was noticing false positives with our test room names like: `planet-1688081266353-room-18` Before: ```regex /(\b|_)18+(\b|_)/i ``` After: ```regex /(\b|_|-|\s|^)18\+(\b|_|-|\s|$)/i ``` --- shared/lib/check-text-for-nsfw.js | 8 +++++++- test/shared/lib/check-text-for-nsfw-tests.js | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/shared/lib/check-text-for-nsfw.js b/shared/lib/check-text-for-nsfw.js index 445e557..1c20820 100644 --- a/shared/lib/check-text-for-nsfw.js +++ b/shared/lib/check-text-for-nsfw.js @@ -1,7 +1,13 @@ 'use strict'; +const escapeStringRegexp = require('escape-string-regexp'); + const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+']; -const NSFW_REGEXES = NSFW_WORDS.map((word) => new RegExp(`(\\b|_)${word}(\\b|_)`, 'i')); +const NSFW_REGEXES = NSFW_WORDS.map( + // We use `(\b|_|-|\s|^)` instead of just `(\b|_)` because the word boundary doesn't + // match next to the `+` sign in `18+` + (word) => new RegExp(`(\\b|_|-|\\s|^)${escapeStringRegexp(word)}(\\b|_|-|\\s|$)`, 'i') +); // A very basic check for NSFW content that just looks for some keywords in the given // text diff --git a/test/shared/lib/check-text-for-nsfw-tests.js b/test/shared/lib/check-text-for-nsfw-tests.js index 9a19179..3d7ba04 100644 --- a/test/shared/lib/check-text-for-nsfw-tests.js +++ b/test/shared/lib/check-text-for-nsfw-tests.js @@ -13,6 +13,11 @@ describe('checkTextForNsfw', () => { NSFW_foo: true, 'NSFW-foo': true, 'NSFW:foo': true, + '18+ only': true, + // Previous false positives that we ran into in the wild that should not be flagged + // as NSFW + '1888-great-blizzard': false, + 'argon-18-element': false, }).forEach(([inputText, expectedNsfw]) => { it(`should return ${expectedNsfw} for '${inputText}'`, () => { assert.strictEqual( From a26b852c5af42625b4cbce667627d832bea10a1a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 29 Jun 2023 18:59:54 -0500 Subject: [PATCH 10/12] Prepare changelog with #279 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76adc26..dae1607 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Prevent Cloudflare from overriding our own 504 timeout page, https://github.com/matrix-org/matrix-public-archive/pull/228 - Catch NSFW rooms with underscores, https://github.com/matrix-org/matrix-public-archive/pull/231 +- Fix `18+` false positives with NSFW check, https://github.com/matrix-org/matrix-public-archive/pull/279 - Fix room cards sorting in the wrong direction on Firefox, https://github.com/matrix-org/matrix-public-archive/pull/261 - Remove `libera.chat` as a default since their rooms are not accessible in the archive, https://github.com/matrix-org/matrix-public-archive/pull/263 - Add reason why the archive bot is joining the room, https://github.com/matrix-org/matrix-public-archive/pull/262 From dd2cd9126da44f2701917e4ba7e5e10bb34856f8 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 30 Jun 2023 03:08:32 -0500 Subject: [PATCH 11/12] Only show `world_readable` rooms in the room directory (#276) Happens to address part of https://github.com/matrix-org/matrix-public-archive/issues/271 but made primarily as a follow-up to https://github.com/matrix-org/matrix-public-archive/pull/239 --- Only 42% rooms on the `matrix.org` room directory are `world_readable` which means we will get pages of rooms that are half-empty most of the time if we just naively fetch 9 rooms at a time. Ideally, we would be able to just add a filter directly to `/publicRooms` in order to only grab the `world_readable` rooms and still get full pages but the filter option doesn't allow us to slice by `world_readable` history visibility. Instead, we have to paginate until we get a full grid of 9 rooms, then make a final `/publicRooms` request to backtrack to the exact continuation point so next page won't skip any rooms in between. --- We had empty spaces in the grid before because some rooms in the room directory are private which we filtered out before. But that was a much more rare experience since only 2% of rooms were private . --- .../matrix-utils/fetch-accessible-rooms.js | 191 ++++++++++++++ server/lib/matrix-utils/fetch-public-rooms.js | 57 ----- server/routes/room-directory-routes.js | 18 +- shared/lib/url-creator.js | 19 +- shared/viewmodels/RoomDirectoryViewModel.js | 3 + shared/views/RoomDirectoryView.js | 15 +- test/dockerfiles/Synapse.Dockerfile | 7 +- test/e2e-tests.js | 232 +++++++++++++++++- test/test-utils/client-utils.js | 51 +++- 9 files changed, 515 insertions(+), 78 deletions(-) create mode 100644 server/lib/matrix-utils/fetch-accessible-rooms.js delete mode 100644 server/lib/matrix-utils/fetch-public-rooms.js diff --git a/server/lib/matrix-utils/fetch-accessible-rooms.js b/server/lib/matrix-utils/fetch-accessible-rooms.js new file mode 100644 index 0000000..cc24bd7 --- /dev/null +++ b/server/lib/matrix-utils/fetch-accessible-rooms.js @@ -0,0 +1,191 @@ +'use strict'; + +const assert = require('assert'); + +const urlJoin = require('url-join'); +const { DIRECTION } = require('matrix-public-archive-shared/lib/reference-values'); +const { fetchEndpointAsJson } = require('../fetch-endpoint'); +const { traceFunction } = require('../../tracing/trace-utilities'); + +const config = require('../config'); +const matrixServerUrl = config.get('matrixServerUrl'); +assert(matrixServerUrl); + +// The number of requests we should make to try to fill the limit before bailing out +const NUM_MAX_REQUESTS = 10; + +async function requestPublicRooms( + accessToken, + { server, searchTerm, paginationToken, limit, abortSignal } = {} +) { + let qs = new URLSearchParams(); + if (server) { + qs.append('server', server); + } + + const publicRoomsEndpoint = urlJoin( + matrixServerUrl, + `_matrix/client/v3/publicRooms?${qs.toString()}` + ); + + const { data: publicRoomsRes } = await fetchEndpointAsJson(publicRoomsEndpoint, { + method: 'POST', + body: { + include_all_networks: true, + filter: { + generic_search_term: searchTerm, + }, + since: paginationToken, + limit, + }, + accessToken, + abortSignal, + }); + + return publicRoomsRes; +} + +// eslint-disable-next-line complexity, max-statements +async function fetchAccessibleRooms( + accessToken, + { + server, + searchTerm, + // Direction is baked into the pagination token but we're unable to decipher it from + // the opaque token, we also have to pass it in explicitly. + paginationToken, + direction = DIRECTION.forward, + limit, + abortSignal, + } = {} +) { + assert(accessToken); + assert([DIRECTION.forward, DIRECTION.backward].includes(direction), 'direction must be [f|b]'); + + // Based off of the matrix.org room directory, only 42% of rooms are world_readable, + // which means our best bet to fill up the results to the limit is to request at least + // 2.4 times as many. I've doubled and rounded it up to 5 times as many so we can have + // less round-trips. + const bulkPaginationLimit = Math.ceil(5 * limit); + + let accessibleRooms = []; + + let firstResponse; + let lastResponse; + + let loopToken = paginationToken; + let lastLoopToken; + let continuationIndex; + let currentRequestCount = 0; + while ( + // Stop if we have reached the limit of rooms we want to fetch + accessibleRooms.length < limit && + // And bail if we're already gone through a bunch of pages to try to fill the limit + currentRequestCount < NUM_MAX_REQUESTS && + // And bail if we've reached the end of the pagination + // Always do the first request + (currentRequestCount === 0 || + // If we have a next token, we can do another request + (currentRequestCount > 0 && loopToken)) + ) { + const publicRoomsRes = await requestPublicRooms(accessToken, { + server, + searchTerm, + paginationToken: loopToken, + limit: bulkPaginationLimit, + abortSignal, + }); + lastLoopToken = loopToken; + lastResponse = publicRoomsRes; + + if (currentRequestCount === 0) { + firstResponse = publicRoomsRes; + } + + // Get the token ready for the next loop + loopToken = + direction === DIRECTION.forward ? publicRoomsRes.next_batch : publicRoomsRes.prev_batch; + + const fetchedRooms = publicRoomsRes.chunk; + const fetchedRoomsInDirection = + direction === DIRECTION.forward ? fetchedRooms : fetchedRooms.reverse(); + + // We only want to see world_readable rooms in the archive + let index = 0; + for (let room of fetchedRoomsInDirection) { + if (room.world_readable) { + if (direction === DIRECTION.forward) { + accessibleRooms.push(room); + } else if (direction === DIRECTION.backward) { + accessibleRooms.unshift(room); + } else { + throw new Error(`Invalid direction: ${direction}`); + } + } + + if (accessibleRooms.length === limit && !continuationIndex) { + continuationIndex = index; + } + + // Stop after we've reached the limit + if (accessibleRooms.length >= limit) { + break; + } + + index += 1; + } + + currentRequestCount += 1; + } + + // Back-track to get the perfect continuation point and show exactly the limit of + // rooms in the grid. + // + // Alternatively, we could just not worry about and show more than the limit of rooms + // + // XXX: Since the room directory order is not stable, this is slightly flawed as the + // results could have shifted slightly from when we made the last request to now but + // we assume it's good enough. + let nextPaginationToken; + let prevPaginationToken; + if (continuationIndex) { + const publicRoomsRes = await requestPublicRooms(accessToken, { + server, + searchTerm, + // Start from the last request + paginationToken: lastLoopToken, + // Then only go out as far out as the continuation index (the point when we filled + // the limit) + limit: continuationIndex + 1, + abortSignal, + }); + + if (direction === DIRECTION.forward) { + prevPaginationToken = firstResponse.prev_batch; + nextPaginationToken = publicRoomsRes.next_batch; + } else if (direction === DIRECTION.backward) { + prevPaginationToken = publicRoomsRes.prev_batch; + nextPaginationToken = firstResponse.next_batch; + } else { + throw new Error(`Invalid direction: ${direction}`); + } + } else { + if (direction === DIRECTION.forward) { + prevPaginationToken = firstResponse.prev_batch; + nextPaginationToken = lastResponse.next_batch; + } else if (direction === DIRECTION.backward) { + prevPaginationToken = lastResponse.prev_batch; + nextPaginationToken = firstResponse.next_batch; + } else { + throw new Error(`Invalid direction: ${direction}`); + } + } + + return { + rooms: accessibleRooms, + prevPaginationToken, + nextPaginationToken, + }; +} + +module.exports = traceFunction(fetchAccessibleRooms); diff --git a/server/lib/matrix-utils/fetch-public-rooms.js b/server/lib/matrix-utils/fetch-public-rooms.js deleted file mode 100644 index bc9e7c5..0000000 --- a/server/lib/matrix-utils/fetch-public-rooms.js +++ /dev/null @@ -1,57 +0,0 @@ -'use strict'; - -const assert = require('assert'); - -const urlJoin = require('url-join'); -const { fetchEndpointAsJson } = require('../fetch-endpoint'); -const { traceFunction } = require('../../tracing/trace-utilities'); - -const config = require('../config'); -const matrixServerUrl = config.get('matrixServerUrl'); -assert(matrixServerUrl); - -async function fetchPublicRooms( - accessToken, - { server, searchTerm, paginationToken, limit, abortSignal } = {} -) { - assert(accessToken); - - let qs = new URLSearchParams(); - if (server) { - qs.append('server', server); - } - - const publicRoomsEndpoint = urlJoin( - matrixServerUrl, - `_matrix/client/v3/publicRooms?${qs.toString()}` - ); - - const { data: publicRoomsRes } = await fetchEndpointAsJson(publicRoomsEndpoint, { - method: 'POST', - body: { - include_all_networks: true, - filter: { - generic_search_term: searchTerm, - }, - since: paginationToken, - limit, - }, - accessToken, - abortSignal, - }); - - // We only want to see public rooms in the archive - const accessibleRooms = publicRoomsRes.chunk.filter((room) => { - // `room.world_readable` is also accessible here but we only use history - // `world_readable` to determine search indexing. - return room.join_rule === 'public'; - }); - - return { - rooms: accessibleRooms, - nextPaginationToken: publicRoomsRes.next_batch, - prevPaginationToken: publicRoomsRes.prev_batch, - }; -} - -module.exports = traceFunction(fetchPublicRooms); diff --git a/server/routes/room-directory-routes.js b/server/routes/room-directory-routes.js index ad87681..fde1467 100644 --- a/server/routes/room-directory-routes.js +++ b/server/routes/room-directory-routes.js @@ -6,10 +6,11 @@ const urlJoin = require('url-join'); const express = require('express'); const asyncHandler = require('../lib/express-async-handler'); +const { DIRECTION } = require('matrix-public-archive-shared/lib/reference-values'); const RouteTimeoutAbortError = require('../lib/errors/route-timeout-abort-error'); const UserClosedConnectionAbortError = require('../lib/errors/user-closed-connection-abort-error'); const identifyRoute = require('../middleware/identify-route-middleware'); -const fetchPublicRooms = require('../lib/matrix-utils/fetch-public-rooms'); +const fetchAccessibleRooms = require('../lib/matrix-utils/fetch-accessible-rooms'); const renderHydrogenVmRenderScriptToPageHtml = require('../hydrogen-render/render-hydrogen-vm-render-script-to-page-html'); const setHeadersToPreloadAssets = require('../lib/set-headers-to-preload-assets'); @@ -33,9 +34,19 @@ router.get( '/', identifyRoute('app-room-directory-index'), asyncHandler(async function (req, res) { - const paginationToken = req.query.page; const searchTerm = req.query.search; const homeserver = req.query.homeserver; + const paginationToken = req.query.page; + const direction = req.query.dir; + + // You must provide both `paginationToken` and `direction` if either is defined + if (paginationToken || direction) { + assert( + [DIRECTION.forward, DIRECTION.backward].includes(direction), + '?dir query parameter must be [f|b]' + ); + assert(paginationToken, '?page query parameter must be defined if ?dir is defined'); + } // It would be good to grab more rooms than we display in case we need // to filter any out but then the pagination tokens with the homeserver @@ -48,12 +59,13 @@ router.get( let prevPaginationToken; let roomFetchError; try { - ({ rooms, nextPaginationToken, prevPaginationToken } = await fetchPublicRooms( + ({ rooms, nextPaginationToken, prevPaginationToken } = await fetchAccessibleRooms( matrixAccessToken, { server: homeserver, searchTerm, paginationToken, + direction, limit, abortSignal: req.abortSignal, } diff --git a/shared/lib/url-creator.js b/shared/lib/url-creator.js index 4285d21..f8b9b3d 100644 --- a/shared/lib/url-creator.js +++ b/shared/lib/url-creator.js @@ -3,7 +3,10 @@ const urlJoin = require('url-join'); const assert = require('matrix-public-archive-shared/lib/assert'); -const { TIME_PRECISION_VALUES } = require('matrix-public-archive-shared/lib/reference-values'); +const { + DIRECTION, + TIME_PRECISION_VALUES, +} = require('matrix-public-archive-shared/lib/reference-values'); function qsToUrlPiece(qs) { if (qs.toString()) { @@ -25,7 +28,16 @@ class URLCreator { return `https://matrix.to/#/${roomIdOrAlias}`; } - roomDirectoryUrl({ searchTerm, homeserver, paginationToken } = {}) { + roomDirectoryUrl({ searchTerm, homeserver, paginationToken, direction } = {}) { + // You must provide both `paginationToken` and `direction` if either is defined + if (paginationToken || direction) { + assert( + [DIRECTION.forward, DIRECTION.backward].includes(direction), + 'direction must be [f|b]' + ); + assert(paginationToken); + } + let qs = new URLSearchParams(); if (searchTerm) { qs.append('search', searchTerm); @@ -36,6 +48,9 @@ class URLCreator { if (paginationToken) { qs.append('page', paginationToken); } + if (direction) { + qs.append('dir', direction); + } return `${this._basePath}${qsToUrlPiece(qs)}`; } diff --git a/shared/viewmodels/RoomDirectoryViewModel.js b/shared/viewmodels/RoomDirectoryViewModel.js index 90c5c13..3268b6b 100644 --- a/shared/viewmodels/RoomDirectoryViewModel.js +++ b/shared/viewmodels/RoomDirectoryViewModel.js @@ -9,6 +9,7 @@ const ModalViewModel = require('matrix-public-archive-shared/viewmodels/ModalVie const HomeserverSelectionModalContentViewModel = require('matrix-public-archive-shared/viewmodels/HomeserverSelectionModalContentViewModel'); const RoomCardViewModel = require('matrix-public-archive-shared/viewmodels/RoomCardViewModel'); const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw'); +const { DIRECTION } = require('../lib/reference-values'); const DEFAULT_SERVER_LIST = ['matrix.org', 'gitter.im']; @@ -304,6 +305,7 @@ class RoomDirectoryViewModel extends ViewModel { homeserver: this.homeserverSelection, searchTerm: this.searchTerm, paginationToken: this._nextPaginationToken, + direction: DIRECTION.forward, }); } @@ -316,6 +318,7 @@ class RoomDirectoryViewModel extends ViewModel { homeserver: this.homeserverSelection, searchTerm: this.searchTerm, paginationToken: this._prevPaginationToken, + direction: DIRECTION.backward, }); } diff --git a/shared/views/RoomDirectoryView.js b/shared/views/RoomDirectoryView.js index 3466322..61157d1 100644 --- a/shared/views/RoomDirectoryView.js +++ b/shared/views/RoomDirectoryView.js @@ -322,10 +322,21 @@ class RoomDirectoryView extends TemplateView { t.view(roomList), t.div({ className: 'RoomDirectoryView_paginationButtonCombo' }, [ t.a( - { className: 'RoomDirectoryView_paginationButton', href: vm.prevPageUrl }, + { + className: 'RoomDirectoryView_paginationButton', + href: vm.prevPageUrl, + 'data-testid': 'room-directory-prev-link', + }, 'Previous' ), - t.a({ className: 'RoomDirectoryView_paginationButton', href: vm.nextPageUrl }, 'Next'), + t.a( + { + className: 'RoomDirectoryView_paginationButton', + href: vm.nextPageUrl, + 'data-testid': 'room-directory-next-link', + }, + 'Next' + ), ]), ]), t.if( diff --git a/test/dockerfiles/Synapse.Dockerfile b/test/dockerfiles/Synapse.Dockerfile index 90407f3..dbfbf1c 100644 --- a/test/dockerfiles/Synapse.Dockerfile +++ b/test/dockerfiles/Synapse.Dockerfile @@ -3,12 +3,7 @@ # # Currently this is based on Complement Synapse images which are based on the # published 'synapse:latest' image -- ie, the most recent Synapse release. - -# FIXME: We're pinning the version to `v1.79.0` until -# https://github.com/matrix-org/synapse/issues/15526 is fixed. Feel free to update back -# to `latest` once that issue is resolved. More context: -# https://github.com/matrix-org/matrix-public-archive/pull/208#discussion_r1183294630 -ARG SYNAPSE_VERSION=v1.79.0 +ARG SYNAPSE_VERSION=latest FROM matrixdotorg/synapse:${SYNAPSE_VERSION} diff --git a/test/e2e-tests.js b/test/e2e-tests.js index b95425c..2e26ece 100644 --- a/test/e2e-tests.js +++ b/test/e2e-tests.js @@ -38,6 +38,7 @@ const { sendMessage, createMessagesInRoom, getMessagesInRoom, + waitForResultsInHomeserverRoomDirectory, updateProfile, uploadContent, } = require('./test-utils/client-utils'); @@ -2507,15 +2508,33 @@ describe('matrix-public-archive', () => { // test runs against the same homeserver const timeToken = Date.now(); const roomPlanetPrefix = `planet-${timeToken}`; + const roomSaturnName = `${roomPlanetPrefix}-saturn`; const roomSaturnId = await createTestRoom(client, { - name: `${roomPlanetPrefix}-saturn`, + name: roomSaturnName, }); + const roomMarsName = `${roomPlanetPrefix}-mars`; const roomMarsId = await createTestRoom(client, { - name: `${roomPlanetPrefix}-mars`, + name: roomMarsName, }); // Browse the room directory without search to see many rooms + // + // (we set this here in case we timeout while waiting for the test rooms to + // appear in the room directory) archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl(); + + // Try to avoid flakey tests where the homeserver hasn't added the rooms to the + // room directory yet. This isn't completely robust as it doesn't check that the + // random room at the start is in the directory but should be good enough. + await waitForResultsInHomeserverRoomDirectory({ + client, + searchTerm: roomSaturnName, + }); + await waitForResultsInHomeserverRoomDirectory({ + client, + searchTerm: roomMarsName, + }); + const { data: roomDirectoryPageHtml } = await fetchEndpointAsText(archiveUrl); const dom = parseHTML(roomDirectoryPageHtml); @@ -2556,17 +2575,33 @@ describe('matrix-public-archive', () => { // test runs against the same homeserver const timeToken = Date.now(); const roomPlanetPrefix = `remote-planet-${timeToken}`; + const roomXName = `${roomPlanetPrefix}-x`; const roomXId = await createTestRoom(hs2Client, { - name: `${roomPlanetPrefix}-x`, + name: roomXName, }); + const roomYname = `${roomPlanetPrefix}-y`; const roomYId = await createTestRoom(hs2Client, { - name: `${roomPlanetPrefix}-y`, + name: roomYname, }); + // (we set this here in case we timeout while waiting for the test rooms to + // appear in the room directory) archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl({ homeserver: HOMESERVER_URL_TO_PRETTY_NAME_MAP[testMatrixServerUrl2], searchTerm: roomPlanetPrefix, }); + + // Try to avoid flakey tests where the homeserver hasn't added the rooms to the + // room directory yet. + await waitForResultsInHomeserverRoomDirectory({ + client: hs2Client, + searchTerm: roomXName, + }); + await waitForResultsInHomeserverRoomDirectory({ + client: hs2Client, + searchTerm: roomYname, + }); + const { data: roomDirectoryWithSearchPageHtml } = await fetchEndpointAsText(archiveUrl); const domWithSearch = parseHTML(roomDirectoryWithSearchPageHtml); @@ -2601,21 +2636,39 @@ describe('matrix-public-archive', () => { // test runs against the same homeserver const timeToken = Date.now(); const roomPlanetPrefix = `planet-${timeToken}`; + const roomUranusName = `${roomPlanetPrefix}-uranus-nsfw`; const roomUranusId = await createTestRoom(client, { // NSFW in title - name: `${roomPlanetPrefix}-uranus-nsfw`, + name: roomUranusName, }); + const roomMarsName = `${roomPlanetPrefix}-mars`; const roomMarsId = await createTestRoom(client, { - name: `${roomPlanetPrefix}-mars`, + name: roomMarsName, // NSFW in room topic/description topic: 'Get your ass to mars (NSFW)', }); // Browse the room directory searching the room directory for those NSFW rooms // (narrowing down results). + // + // (we set this here in case we timeout while waiting for the test rooms to + // appear in the room directory) archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl({ searchTerm: roomPlanetPrefix, }); + + // Try to avoid flakey tests where the homeserver hasn't added the rooms to the + // room directory yet. This isn't completely robust as it doesn't check that the + // random room at the start is in the directory but should be good enough. + await waitForResultsInHomeserverRoomDirectory({ + client, + searchTerm: roomUranusName, + }); + await waitForResultsInHomeserverRoomDirectory({ + client, + searchTerm: roomMarsName, + }); + const { data: roomDirectoryWithSearchPageHtml } = await fetchEndpointAsText(archiveUrl); const domWithSearch = parseHTML(roomDirectoryWithSearchPageHtml); @@ -2644,6 +2697,173 @@ describe('matrix-public-archive', () => { ); }); }); + + it('pagination is seamless', async () => { + const client = await getTestClientForHs(testMatrixServerUrl1); + // We use a `timeToken` so that we can namespace these rooms away from other + // test runs against the same homeserver + const timeToken = Date.now(); + const roomPlanetPrefix = `planet-${timeToken}`; + + // Fill up the room room directory with multiple pages of rooms + const visibleRoomConfigurations = []; + const roomsConfigurationsToCreate = []; + for (let i = 0; i < 40; i++) { + const roomCreateOptions = { + name: `${roomPlanetPrefix}-room-${i}`, + }; + + // Sprinkle in some rooms every so often that should not appear in the room directory + if (i % 3 === 0) { + roomCreateOptions.name = `${roomPlanetPrefix}-room-not-world-readable-${i}`; + roomCreateOptions.initial_state = [ + { + type: 'm.room.history_visibility', + state_key: '', + content: { + history_visibility: 'joined', + }, + }, + { + type: 'm.room.topic', + state_key: '', + content: { + // Just a specific token we can search for in the DOM to make sure + // this room does not appear in the room directory. + topic: 'should-not-be-visible-in-archive-room-directory', + }, + }, + ]; + } else { + visibleRoomConfigurations.push(roomCreateOptions); + } + + roomsConfigurationsToCreate.push(roomCreateOptions); + } + + // Doing all of these create room requests in parallel is about 2x faster than + // doing them serially and the room directory doesn't return the rooms in any + // particular order so it doesn't make the test any more clear doing them + // serially anyway. + const createdRoomsIds = await Promise.all( + roomsConfigurationsToCreate.map((roomCreateOptions) => + createTestRoom(client, roomCreateOptions) + ) + ); + + function roomIdToRoomName(expectedRoomId) { + const roomIndex = createdRoomsIds.findIndex((roomId) => { + return roomId === expectedRoomId; + }); + assert( + roomIndex > 0, + `Expected to find expectedRoomId=${expectedRoomId} in the list of created rooms createdRoomsIds=${createdRoomsIds}` + ); + + const roomConfig = roomsConfigurationsToCreate[roomIndex]; + assert( + roomConfig, + `Expected to find room config for roomIndex=${roomIndex} in the list of roomsConfigurationsToCreate (length ${roomsConfigurationsToCreate.length})}` + ); + + return roomConfig.name; + } + + async function checkRoomsOnPage(archiveUrl) { + const { data: roomDirectoryWithSearchPageHtml } = await fetchEndpointAsText(archiveUrl); + const dom = parseHTML(roomDirectoryWithSearchPageHtml); + + const roomsCardsOnPageWithSearch = [ + ...dom.document.querySelectorAll(`[data-testid="room-card"]`), + ]; + + const roomsIdsOnPage = roomsCardsOnPageWithSearch.map((roomCardEl) => { + return roomCardEl.getAttribute('data-room-id'); + }); + + // Sanity check that we don't see any non-world_readable rooms. + roomsCardsOnPageWithSearch.forEach((roomCardEl) => { + assert.match( + roomCardEl.innerHTML, + /^((?!should-not-be-visible-in-archive-room-directory).)*$/, + `Expected not to see any non-world_readable rooms on the page but saw ${roomCardEl.getAttribute( + 'data-room-id' + )} which has "should-not-be-visible-in-archive-room-directory" in the room topic` + ); + }); + + // Find the pagination buttons and grab the links to the previous and next pages + const previousLinkElement = dom.document.querySelector( + `[data-testid="room-directory-prev-link"]` + ); + const nextLinkElement = dom.document.querySelector( + `[data-testid="room-directory-next-link"]` + ); + + const previousPaginationLink = previousLinkElement.getAttribute('href'); + const nextPaginationLink = nextLinkElement.getAttribute('href'); + + return { + archiveUrl, + roomsIdsOnPage, + previousPaginationLink, + nextPaginationLink, + }; + } + + // Browse the room directory with the search prefix so we only see rooms + // relevant to this test. + // + // (we set this here in case we timeout while waiting for the test rooms to + // appear in the room directory) + archiveUrl = matrixPublicArchiveURLCreator.roomDirectoryUrl({ + searchTerm: roomPlanetPrefix, + }); + + // Try to avoid flakey tests where the homeserver hasn't added the rooms + // to the room directory yet. This isn't completely robust as it doesn't check + // that all rooms are visible but it's better than nothing. + await waitForResultsInHomeserverRoomDirectory({ + client, + searchTerm: visibleRoomConfigurations[0].name, + }); + await waitForResultsInHomeserverRoomDirectory({ + client, + searchTerm: visibleRoomConfigurations[visibleRoomConfigurations.length - 1].name, + }); + + // Visit a sequence of pages using the pagination links: 1 -> 2 -> 3 -> 2 -> 1 + const firstPage = await checkRoomsOnPage(archiveUrl); + const secondPage = await checkRoomsOnPage(firstPage.nextPaginationLink); + const thirdPage = await checkRoomsOnPage(secondPage.nextPaginationLink); + const backtrackSecondPage = await checkRoomsOnPage(thirdPage.previousPaginationLink); + const backtrackFirstPage = await checkRoomsOnPage( + backtrackSecondPage.previousPaginationLink + ); + + // Ensure that we saw all of the visible rooms paginating through the directory + assert.deepStrictEqual( + [...firstPage.roomsIdsOnPage, ...secondPage.roomsIdsOnPage, ...thirdPage.roomsIdsOnPage] + .map(roomIdToRoomName) + .sort(), + visibleRoomConfigurations.map((roomConfig) => roomConfig.name).sort(), + 'Make sure we saw all visible rooms paginating through the directory' + ); + + // Ensure that we see the same rooms in the same order going backward that we saw going forward + archiveUrl = backtrackSecondPage.archiveUrl; + assert.deepStrictEqual( + backtrackSecondPage.roomsIdsOnPage.map(roomIdToRoomName), + secondPage.roomsIdsOnPage.map(roomIdToRoomName), + 'From the third page, going backward to second page should show the same rooms that we saw on the second page when going forward' + ); + archiveUrl = backtrackFirstPage.archiveUrl; + assert.deepStrictEqual( + backtrackFirstPage.roomsIdsOnPage.map(roomIdToRoomName), + firstPage.roomsIdsOnPage.map(roomIdToRoomName), + 'From the second page, going backward to first page should show the same rooms that we saw on first page when going forward' + ); + }); }); describe('access controls', () => { diff --git a/test/test-utils/client-utils.js b/test/test-utils/client-utils.js index e101618..f8d7ef3 100644 --- a/test/test-utils/client-utils.js +++ b/test/test-utils/client-utils.js @@ -4,6 +4,8 @@ const assert = require('assert'); const urlJoin = require('url-join'); const { fetchEndpointAsJson, fetchEndpoint } = require('../../server/lib/fetch-endpoint'); const getServerNameFromMatrixRoomIdOrAlias = require('../../server/lib/matrix-utils/get-server-name-from-matrix-room-id-or-alias'); +const { MS_LOOKUP } = require('matrix-public-archive-shared/lib/reference-values'); +const { ONE_SECOND_IN_MS } = MS_LOOKUP; const config = require('../../server/lib/config'); const matrixAccessToken = config.get('matrixAccessToken'); @@ -14,7 +16,7 @@ assert(testMatrixServerUrl1); let txnCount = 0; function getTxnId() { txnCount++; - return `${new Date().getTime()}--${txnCount}`; + return `txn${txnCount}-${new Date().getTime()}`; } // Basic slugify function, plenty of edge cases and should not be used for @@ -150,7 +152,7 @@ async function createTestRoom(client, overrideCreateOptions = {}) { } const roomName = overrideCreateOptions.name || 'the hangout spot'; - const roomAlias = slugify(roomName + getTxnId()); + const roomAlias = slugify(roomName + '-' + getTxnId()); const { data: createRoomResponse } = await fetchEndpointAsJson( urlJoin(client.homeserverUrl, `/_matrix/client/v3/createRoom?${qs.toString()}`), @@ -421,6 +423,50 @@ async function uploadContent({ client, roomId, data, fileName, contentType }) { return mxcUri; } +// This can be removed after https://github.com/matrix-org/synapse/issues/15526 is solved +async function waitForResultsInHomeserverRoomDirectory({ + client, + searchTerm, + timeoutMs = 10 * ONE_SECOND_IN_MS, +}) { + assert(client); + assert(searchTerm !== undefined); + + const roomDirectoryEndpoint = urlJoin(client.homeserverUrl, `_matrix/client/v3/publicRooms`); + + // eslint-disable-next-line no-async-promise-executor + await new Promise(async (resolve, reject) => { + try { + setTimeout(() => { + reject(new Error('Timed out waiting for rooms to appear in the room directory')); + }, timeoutMs); + + let foundResults = false; + while (!foundResults) { + const { data: publicRoomsRes } = await fetchEndpointAsJson(roomDirectoryEndpoint, { + method: 'POST', + body: { + include_all_networks: true, + filter: { + generic_search_term: searchTerm, + }, + limit: 1, + }, + accessToken: client.accessToken, + }); + + if (publicRoomsRes.chunk.length > 0) { + foundResults = true; + resolve(); + break; + } + } + } catch (err) { + reject(err); + } + }); +} + module.exports = { ensureUserRegistered, getTestClientForAs, @@ -433,6 +479,7 @@ module.exports = { sendMessage, createMessagesInRoom, getMessagesInRoom, + waitForResultsInHomeserverRoomDirectory, updateProfile, uploadContent, }; From 1d1d7d2d0d3935dbff17c24ea36c74346086434d Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 30 Jun 2023 03:09:26 -0500 Subject: [PATCH 12/12] Prepare changelog with #276 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index dae1607..5c83771 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - Contributed by [@tulir](https://github.com/tulir) - Update FAQ to explain `world_readable` only, https://github.com/matrix-org/matrix-public-archive/pull/277 - Indicate when the room was set to `world_readable` and by who, https://github.com/matrix-org/matrix-public-archive/pull/278 +- Only show `world_readable` rooms in the room directory, https://github.com/matrix-org/matrix-public-archive/pull/276 Developer facing: