From 198e8c09be78897b89fead9bfbf177af8fb11232 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 5 May 2023 15:36:26 -0500 Subject: [PATCH] Mark NSFW room pages with `<meta name="rating" content="adult">` (#216) Related docs: - https://developers.google.com/search/docs/crawling-indexing/safesearch - https://developers.google.com/search/docs/crawling-indexing/special-tags --- server/hydrogen-render/render-page-html.js | 10 +++- server/lib/matrix-utils/fetch-room-data.js | 12 +++++ server/routes/room-routes.js | 8 ++++ shared/lib/check-text-for-nsfw.js | 14 ++++++ shared/viewmodels/RoomDirectoryViewModel.js | 8 +--- test/e2e-tests.js | 52 +++++++++++++++++++++ 6 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 shared/lib/check-text-for-nsfw.js diff --git a/server/hydrogen-render/render-page-html.js b/server/hydrogen-render/render-page-html.js index e03a58f..fced81d 100644 --- a/server/hydrogen-render/render-page-html.js +++ b/server/hydrogen-render/render-page-html.js @@ -35,7 +35,14 @@ function renderPageHtml({ // We shouldn't let some pages be indexed by search engines let maybeNoIndexHtml = ''; if (!pageOptions.shouldIndex) { - maybeNoIndexHtml = ``; + maybeNoIndexHtml = ``; + } + + // We should tell search engines that some pages are NSFW, see + // https://developers.google.com/search/docs/crawling-indexing/safesearch + let maybeAdultMeta = ''; + if (pageOptions.blockedBySafeSearch) { + maybeAdultMeta = ``; } const faviconMap = getFaviconAssetUrls(); @@ -45,6 +52,7 @@ function renderPageHtml({ ${maybeNoIndexHtml} + ${maybeAdultMeta} ${sanitizeHtml(`${pageOptions.title}`)} ${sanitizeHtml(``)} diff --git a/server/lib/matrix-utils/fetch-room-data.js b/server/lib/matrix-utils/fetch-room-data.js index 624b1c3..91c702e 100644 --- a/server/lib/matrix-utils/fetch-room-data.js +++ b/server/lib/matrix-utils/fetch-room-data.js @@ -151,6 +151,7 @@ const fetchRoomData = traceFunction(async function ( const [ stateNameResDataOutcome, + stateTopicResDataOutcome, 
stateCanonicalAliasResDataOutcome, stateAvatarResDataOutcome, stateHistoryVisibilityResDataOutcome, @@ -162,6 +163,10 @@ const fetchRoomData = traceFunction(async function ( accessToken: matrixAccessToken, abortSignal, }), + fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.topic'), { + accessToken: matrixAccessToken, + abortSignal, + }), fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.canonical_alias'), { accessToken: matrixAccessToken, abortSignal, @@ -197,6 +202,12 @@ const fetchRoomData = traceFunction(async function ( canonicalAlias = data?.content?.alias; } + let topic; + if (stateTopicResDataOutcome.reason === undefined) { + const { data } = stateTopicResDataOutcome.value; + topic = data?.content?.topic; + } + let avatarUrl; if (stateAvatarResDataOutcome.reason === undefined) { const { data } = stateAvatarResDataOutcome.value; @@ -236,6 +247,7 @@ const fetchRoomData = traceFunction(async function ( return { id: roomId, name, + topic, canonicalAlias, avatarUrl, historyVisibility, diff --git a/server/routes/room-routes.js b/server/routes/room-routes.js index ff22295..a0f3989 100644 --- a/server/routes/room-routes.js +++ b/server/routes/room-routes.js @@ -26,6 +26,7 @@ const renderHydrogenVmRenderScriptToPageHtml = require('../hydrogen-render/rende const setHeadersToPreloadAssets = require('../lib/set-headers-to-preload-assets'); const setHeadersForDateTemporalContext = require('../lib/set-headers-for-date-temporal-context'); const MatrixPublicArchiveURLCreator = require('matrix-public-archive-shared/lib/url-creator'); +const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw'); const { MS_LOOKUP, TIME_PRECISION_VALUES, @@ -896,9 +897,16 @@ router.get( shouldIndex = roomData?.historyVisibility === `world_readable`; } + const isNsfw = checkTextForNsfw( + // We concat the name, topic, etc together to simply do a single check against + // all of the text. 
+ `${roomData.name} --- ${roomData.canonicalAlias} --- ${roomData.topic} ` + ); + const pageOptions = { title: `${roomData.name} - Matrix Public Archive`, description: `View the history of ${roomData.name} in the Matrix Public Archive`, + blockedBySafeSearch: isNsfw, entryPoint: 'client/js/entry-client-hydrogen.js', locationHref: urlJoin(basePath, req.originalUrl), shouldIndex, diff --git a/shared/lib/check-text-for-nsfw.js b/shared/lib/check-text-for-nsfw.js new file mode 100644 index 0000000..ef65518 --- /dev/null +++ b/shared/lib/check-text-for-nsfw.js @@ -0,0 +1,14 @@ +'use strict'; + +const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+']; +const NSFW_REGEXES = NSFW_WORDS.map((word) => new RegExp(`\\b${word}\\b`, 'i')); + +// A very basic check for NSFW content that just looks for some keywords in the given +// text +function checkTextForNsfw(text) { + const isNsfw = NSFW_REGEXES.some((regex) => regex.test(text)); + + return isNsfw; +} + +module.exports = checkTextForNsfw; diff --git a/shared/viewmodels/RoomDirectoryViewModel.js b/shared/viewmodels/RoomDirectoryViewModel.js index 90742b8..1b28022 100644 --- a/shared/viewmodels/RoomDirectoryViewModel.js +++ b/shared/viewmodels/RoomDirectoryViewModel.js @@ -8,12 +8,10 @@ const LOCAL_STORAGE_KEYS = require('matrix-public-archive-shared/lib/local-stora const ModalViewModel = require('matrix-public-archive-shared/viewmodels/ModalViewModel'); const HomeserverSelectionModalContentViewModel = require('matrix-public-archive-shared/viewmodels/HomeserverSelectionModalContentViewModel'); const RoomCardViewModel = require('matrix-public-archive-shared/viewmodels/RoomCardViewModel'); +const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw'); const DEFAULT_SERVER_LIST = ['matrix.org', 'gitter.im', 'libera.chat']; -const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+']; -const NSFW_REGEXES = NSFW_WORDS.map((word) => new RegExp(`\\b${word}\\b`, 'i')); - class RoomDirectoryViewModel 
extends ViewModel { constructor(options) { super(options); @@ -267,9 +265,7 @@ class RoomDirectoryViewModel extends ViewModel { this._roomCardViewModelsFilterMap.setApply((roomId, vm) => { // We concat the name, topic, etc together to simply do a single check against // all of the text. - const isNsfw = NSFW_REGEXES.some((regex) => - regex.test(vm.name + ' ---- ' + vm.canonicalAlias + ' --- ' + vm.topic) - ); + const isNsfw = checkTextForNsfw(vm.name + ' --- ' + vm.canonicalAlias + ' --- ' + vm.topic); vm.setBlockedBySafeSearch(isNsfw); }); } else { diff --git a/test/e2e-tests.js b/test/e2e-tests.js index 526f489..9f21fdf 100644 --- a/test/e2e-tests.js +++ b/test/e2e-tests.js @@ -633,6 +633,58 @@ describe('matrix-public-archive', () => { } }); + describe('safe search', () => { + [ + { + testName: 'nsfw words in title', + createRoomOptions: { + name: `uranus-nsfw`, + }, + }, + { + testName: 'nsfw words in topic', + createRoomOptions: { + name: `mars`, + topic: 'Get your ass to mars (NSFW)', + }, + }, + ].forEach((testCase) => { + it(`${testCase.testName} is correctly blocked/marked by safe search`, async () => { + const client = await getTestClientForHs(testMatrixServerUrl1); + const roomId = await createTestRoom(client, testCase.createRoomOptions); + + archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForDate(roomId, archiveDate); + const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl); + const dom = parseHTML(archivePageHtml); + + // Make sure the `<meta name="rating" content="adult">` tag exists on the page + // telling search engines that this is an adult page. + const metaElements = Array.from(dom.document.querySelectorAll('meta')); + assert.strictEqual( + dom.document.querySelector(`meta[name="rating"]`)?.getAttribute('content'), + 'adult', + `Unable to find on the page. 
We found these meta elements though:${metaElements + // eslint-disable-next-line max-nested-callbacks + .map((metaElement) => `\n \`${metaElement.outerHTML}\``) + .join('')}` + ); + }); + }); + + it('normal room is not blocked/marked by safe search', async () => { + const client = await getTestClientForHs(testMatrixServerUrl1); + const roomId = await createTestRoom(client); + + archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForDate(roomId, archiveDate); + const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl); + const dom = parseHTML(archivePageHtml); + + // Make sure the `<meta name="rating" content="adult">` tag does NOT exist on the + // page telling search engines that this is an adult page. + assert.strictEqual(dom.document.querySelector(`meta[name="rating"]`), null); + }); + }); + describe('time selector', () => { it('shows time selector when there are too many messages from the same day', async () => { // Set this low so it's easy to hit the limit