Mark NSFW room pages with `<meta name="rating" content="adult">` (#216)

Related docs:

 - https://developers.google.com/search/docs/crawling-indexing/safesearch
 - https://developers.google.com/search/docs/crawling-indexing/special-tags
This commit is contained in:
Eric Eastwood 2023-05-05 15:36:26 -05:00 committed by GitHub
parent aeceb195e2
commit 198e8c09be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 97 additions and 7 deletions

View File

@ -35,7 +35,14 @@ function renderPageHtml({
// We shouldn't let some pages be indexed by search engines
let maybeNoIndexHtml = '';
if (!pageOptions.shouldIndex) {
maybeNoIndexHtml = `<meta name="robots" content="noindex, nofollow" />`;
maybeNoIndexHtml = `<meta name="robots" content="noindex, nofollow">`;
}
// We should tell search engines that some pages are NSFW, see
// https://developers.google.com/search/docs/crawling-indexing/safesearch
let maybeAdultMeta = '';
if (pageOptions.blockedBySafeSearch) {
maybeAdultMeta = `<meta name="rating" content="adult">`;
}
const faviconMap = getFaviconAssetUrls();
@ -45,6 +52,7 @@ function renderPageHtml({
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
${maybeNoIndexHtml}
${maybeAdultMeta}
${sanitizeHtml(`<title>${pageOptions.title}</title>`)}
${sanitizeHtml(`<meta name="description" content="${pageOptions.description}">`)}
<link rel="icon" href="${faviconMap.ico}" sizes="any">

View File

@ -151,6 +151,7 @@ const fetchRoomData = traceFunction(async function (
const [
stateNameResDataOutcome,
stateTopicResDataOutcome,
stateCanonicalAliasResDataOutcome,
stateAvatarResDataOutcome,
stateHistoryVisibilityResDataOutcome,
@ -162,6 +163,10 @@ const fetchRoomData = traceFunction(async function (
accessToken: matrixAccessToken,
abortSignal,
}),
fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.topic'), {
accessToken: matrixAccessToken,
abortSignal,
}),
fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.canonical_alias'), {
accessToken: matrixAccessToken,
abortSignal,
@ -197,6 +202,12 @@ const fetchRoomData = traceFunction(async function (
canonicalAlias = data?.content?.alias;
}
let topic;
if (stateTopicResDataOutcome.reason === undefined) {
const { data } = stateTopicResDataOutcome.value;
topic = data?.content?.topic;
}
let avatarUrl;
if (stateAvatarResDataOutcome.reason === undefined) {
const { data } = stateAvatarResDataOutcome.value;
@ -236,6 +247,7 @@ const fetchRoomData = traceFunction(async function (
return {
id: roomId,
name,
topic,
canonicalAlias,
avatarUrl,
historyVisibility,

View File

@ -26,6 +26,7 @@ const renderHydrogenVmRenderScriptToPageHtml = require('../hydrogen-render/rende
const setHeadersToPreloadAssets = require('../lib/set-headers-to-preload-assets');
const setHeadersForDateTemporalContext = require('../lib/set-headers-for-date-temporal-context');
const MatrixPublicArchiveURLCreator = require('matrix-public-archive-shared/lib/url-creator');
const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw');
const {
MS_LOOKUP,
TIME_PRECISION_VALUES,
@ -896,9 +897,16 @@ router.get(
shouldIndex = roomData?.historyVisibility === `world_readable`;
}
const isNsfw = checkTextForNsfw(
// We concat the name, topic, etc together to simply do a single check against
// all of the text.
`${roomData.name} --- ${roomData.canonicalAlias} --- ${roomData.topic} `
);
const pageOptions = {
title: `${roomData.name} - Matrix Public Archive`,
description: `View the history of ${roomData.name} in the Matrix Public Archive`,
blockedBySafeSearch: isNsfw,
entryPoint: 'client/js/entry-client-hydrogen.js',
locationHref: urlJoin(basePath, req.originalUrl),
shouldIndex,

View File

@ -0,0 +1,14 @@
'use strict';
// Keywords that flag a piece of text as NSFW when they appear as a standalone token
// (matched case-insensitively).
const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+'];

/**
 * Build a case-insensitive regex that matches `word` literally as a standalone token.
 *
 * The keyword is escaped first so regex metacharacters are matched literally (without
 * this, the `+` in `18+` is treated as a quantifier and a plain `18` or `188` would
 * match). A `\b` word boundary is only added on an edge of the keyword that is a word
 * character: `\b` placed after a trailing `+` would require a word character to
 * directly follow it and so would never match `18+` at the end of the text.
 *
 * @param {string} word - Keyword to match literally
 * @returns {RegExp} Regex that tests whether the keyword appears in some text
 */
function createKeywordRegex(word) {
  const escapedWord = word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const leadingBoundary = /^\w/.test(word) ? '\\b' : '';
  const trailingBoundary = /\w$/.test(word) ? '\\b' : '';
  return new RegExp(`${leadingBoundary}${escapedWord}${trailingBoundary}`, 'i');
}

const NSFW_REGEXES = NSFW_WORDS.map((word) => createKeywordRegex(word));

/**
 * A very basic check for NSFW content that just looks for some keywords in the given
 * text.
 *
 * @param {string} text - Text to check (callers concatenate room name, alias, topic)
 * @returns {boolean} `true` when any NSFW keyword is found in the text
 */
function checkTextForNsfw(text) {
  return NSFW_REGEXES.some((regex) => regex.test(text));
}
module.exports = checkTextForNsfw;

View File

@ -8,12 +8,10 @@ const LOCAL_STORAGE_KEYS = require('matrix-public-archive-shared/lib/local-stora
const ModalViewModel = require('matrix-public-archive-shared/viewmodels/ModalViewModel');
const HomeserverSelectionModalContentViewModel = require('matrix-public-archive-shared/viewmodels/HomeserverSelectionModalContentViewModel');
const RoomCardViewModel = require('matrix-public-archive-shared/viewmodels/RoomCardViewModel');
const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw');
const DEFAULT_SERVER_LIST = ['matrix.org', 'gitter.im', 'libera.chat'];
const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+'];
const NSFW_REGEXES = NSFW_WORDS.map((word) => new RegExp(`\\b${word}\\b`, 'i'));
class RoomDirectoryViewModel extends ViewModel {
constructor(options) {
super(options);
@ -267,9 +265,7 @@ class RoomDirectoryViewModel extends ViewModel {
this._roomCardViewModelsFilterMap.setApply((roomId, vm) => {
// We concat the name, topic, etc together to simply do a single check against
// all of the text.
const isNsfw = NSFW_REGEXES.some((regex) =>
regex.test(vm.name + ' ---- ' + vm.canonicalAlias + ' --- ' + vm.topic)
);
const isNsfw = checkTextForNsfw(vm.name + ' --- ' + vm.canonicalAlias + ' --- ' + vm.topic);
vm.setBlockedBySafeSearch(isNsfw);
});
} else {

View File

@ -633,6 +633,58 @@ describe('matrix-public-archive', () => {
}
});
// Verifies that archive pages carry `<meta name="rating" content="adult">` only when
// the room is flagged by safe search.
describe('safe search', () => {
// Table of rooms that should be flagged; one test is generated per entry. Each entry's
// `createRoomOptions` is passed straight to `createTestRoom(...)`.
[
{
testName: 'nsfw words in title',
createRoomOptions: {
name: `uranus-nsfw`,
},
},
{
testName: 'nsfw words in topic',
createRoomOptions: {
name: `mars`,
topic: 'Get your ass to mars (NSFW)',
},
},
].forEach((testCase) => {
it(`${testCase.testName} is correctly blocked/marked by safe search`, async () => {
const client = await getTestClientForHs(testMatrixServerUrl1);
const roomId = await createTestRoom(client, testCase.createRoomOptions);
// NOTE(review): `archiveUrl` is assigned without a declaration in this block;
// presumably it is declared in an enclosing scope -- confirm.
archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForDate(roomId, archiveDate);
// Fetch the rendered archive page as text and parse it so we can query the DOM
const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl);
const dom = parseHTML(archivePageHtml);
// Make sure the `<meta name="rating" ...>` tag exists on the page
// telling search engines that this is an adult page.
//
// All `<meta>` elements are collected up front only so the assertion failure
// message can list what was actually found on the page.
const metaElements = Array.from(dom.document.querySelectorAll('meta'));
assert.strictEqual(
dom.document.querySelector(`meta[name="rating"]`)?.getAttribute('content'),
'adult',
`Unable to find <meta name="rating" content="adult"> on the page. We found these meta elements though:${metaElements
// eslint-disable-next-line max-nested-callbacks
.map((metaElement) => `\n \`${metaElement.outerHTML}\``)
.join('')}`
);
});
});
// Negative case: a room created without any custom options must not be marked
it('normal room is not blocked/marked by safe search', async () => {
const client = await getTestClientForHs(testMatrixServerUrl1);
const roomId = await createTestRoom(client);
archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForDate(roomId, archiveDate);
const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl);
const dom = parseHTML(archivePageHtml);
// Make sure the `<meta name="rating" ...>` tag does NOT exist on the
// page telling search engines that this is an adult page.
assert.strictEqual(dom.document.querySelector(`meta[name="rating"]`), null);
});
});
describe('time selector', () => {
it('shows time selector when there are too many messages from the same day', async () => {
// Set this low so it's easy to hit the limit