Mark NSFW room pages with `<meta name="rating" content="adult">` (#216)
Related docs: - https://developers.google.com/search/docs/crawling-indexing/safesearch - https://developers.google.com/search/docs/crawling-indexing/special-tags
This commit is contained in:
parent
aeceb195e2
commit
198e8c09be
|
@ -35,7 +35,14 @@ function renderPageHtml({
|
|||
// We shouldn't let some pages be indexed by search engines
|
||||
let maybeNoIndexHtml = '';
|
||||
if (!pageOptions.shouldIndex) {
|
||||
maybeNoIndexHtml = `<meta name="robots" content="noindex, nofollow" />`;
|
||||
maybeNoIndexHtml = `<meta name="robots" content="noindex, nofollow">`;
|
||||
}
|
||||
|
||||
// We should tell search engines that some pages are NSFW, see
|
||||
// https://developers.google.com/search/docs/crawling-indexing/safesearch
|
||||
let maybeAdultMeta = '';
|
||||
if (pageOptions.blockedBySafeSearch) {
|
||||
maybeAdultMeta = `<meta name="rating" content="adult">`;
|
||||
}
|
||||
|
||||
const faviconMap = getFaviconAssetUrls();
|
||||
|
@ -45,6 +52,7 @@ function renderPageHtml({
|
|||
<head>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
${maybeNoIndexHtml}
|
||||
${maybeAdultMeta}
|
||||
${sanitizeHtml(`<title>${pageOptions.title}</title>`)}
|
||||
${sanitizeHtml(`<meta name="description" content="${pageOptions.description}">`)}
|
||||
<link rel="icon" href="${faviconMap.ico}" sizes="any">
|
||||
|
|
|
@ -151,6 +151,7 @@ const fetchRoomData = traceFunction(async function (
|
|||
|
||||
const [
|
||||
stateNameResDataOutcome,
|
||||
stateTopicResDataOutcome,
|
||||
stateCanonicalAliasResDataOutcome,
|
||||
stateAvatarResDataOutcome,
|
||||
stateHistoryVisibilityResDataOutcome,
|
||||
|
@ -162,6 +163,10 @@ const fetchRoomData = traceFunction(async function (
|
|||
accessToken: matrixAccessToken,
|
||||
abortSignal,
|
||||
}),
|
||||
fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.topic'), {
|
||||
accessToken: matrixAccessToken,
|
||||
abortSignal,
|
||||
}),
|
||||
fetchEndpointAsJson(getStateEndpointForRoomIdAndEventType(roomId, 'm.room.canonical_alias'), {
|
||||
accessToken: matrixAccessToken,
|
||||
abortSignal,
|
||||
|
@ -197,6 +202,12 @@ const fetchRoomData = traceFunction(async function (
|
|||
canonicalAlias = data?.content?.alias;
|
||||
}
|
||||
|
||||
let topic;
|
||||
if (stateTopicResDataOutcome.reason === undefined) {
|
||||
const { data } = stateTopicResDataOutcome.value;
|
||||
topic = data?.content?.topic;
|
||||
}
|
||||
|
||||
let avatarUrl;
|
||||
if (stateAvatarResDataOutcome.reason === undefined) {
|
||||
const { data } = stateAvatarResDataOutcome.value;
|
||||
|
@ -236,6 +247,7 @@ const fetchRoomData = traceFunction(async function (
|
|||
return {
|
||||
id: roomId,
|
||||
name,
|
||||
topic,
|
||||
canonicalAlias,
|
||||
avatarUrl,
|
||||
historyVisibility,
|
||||
|
|
|
@ -26,6 +26,7 @@ const renderHydrogenVmRenderScriptToPageHtml = require('../hydrogen-render/rende
|
|||
const setHeadersToPreloadAssets = require('../lib/set-headers-to-preload-assets');
|
||||
const setHeadersForDateTemporalContext = require('../lib/set-headers-for-date-temporal-context');
|
||||
const MatrixPublicArchiveURLCreator = require('matrix-public-archive-shared/lib/url-creator');
|
||||
const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw');
|
||||
const {
|
||||
MS_LOOKUP,
|
||||
TIME_PRECISION_VALUES,
|
||||
|
@ -896,9 +897,16 @@ router.get(
|
|||
shouldIndex = roomData?.historyVisibility === `world_readable`;
|
||||
}
|
||||
|
||||
const isNsfw = checkTextForNsfw(
|
||||
// We concat the name, topic, etc together to simply do a single check against
|
||||
// all of the text.
|
||||
`${roomData.name} --- ${roomData.canonicalAlias} --- ${roomData.topic} `
|
||||
);
|
||||
|
||||
const pageOptions = {
|
||||
title: `${roomData.name} - Matrix Public Archive`,
|
||||
description: `View the history of ${roomData.name} in the Matrix Public Archive`,
|
||||
blockedBySafeSearch: isNsfw,
|
||||
entryPoint: 'client/js/entry-client-hydrogen.js',
|
||||
locationHref: urlJoin(basePath, req.originalUrl),
|
||||
shouldIndex,
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
'use strict';
|
||||
|
||||
const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+'];

// Escape regex metacharacters so that a keyword like `18+` is matched literally
// instead of being interpreted as a pattern ("1" followed by one or more "8"),
// which would flag harmless numbers such as "18" or "1888".
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// One case-insensitive regex per keyword. A `\b` word boundary is only added on
// a side of the keyword that starts/ends with a word character: `\b` placed
// after a non-word character like `+` would require a word character to follow,
// so "18+" at the end of a topic or before a space would never match.
const NSFW_REGEXES = NSFW_WORDS.map((word) => {
  const leadingBoundary = /^\w/.test(word) ? '\\b' : '';
  const trailingBoundary = /\w$/.test(word) ? '\\b' : '';
  return new RegExp(`${leadingBoundary}${escapeRegExp(word)}${trailingBoundary}`, 'i');
});

/**
 * A very basic check for NSFW content that just looks for some keywords in the
 * given text.
 *
 * Keywords are matched case-insensitively and as whole words, so "Middlesex"
 * does not match "sex" and "18" on its own does not match "18+".
 *
 * @param {string} text - Text to scan (callers concatenate room name, alias and topic).
 * @returns {boolean} `true` when at least one NSFW keyword is found in `text`.
 */
function checkTextForNsfw(text) {
  return NSFW_REGEXES.some((regex) => regex.test(text));
}
|
||||
|
||||
module.exports = checkTextForNsfw;
|
|
@ -8,12 +8,10 @@ const LOCAL_STORAGE_KEYS = require('matrix-public-archive-shared/lib/local-stora
|
|||
const ModalViewModel = require('matrix-public-archive-shared/viewmodels/ModalViewModel');
|
||||
const HomeserverSelectionModalContentViewModel = require('matrix-public-archive-shared/viewmodels/HomeserverSelectionModalContentViewModel');
|
||||
const RoomCardViewModel = require('matrix-public-archive-shared/viewmodels/RoomCardViewModel');
|
||||
const checkTextForNsfw = require('matrix-public-archive-shared/lib/check-text-for-nsfw');
|
||||
|
||||
const DEFAULT_SERVER_LIST = ['matrix.org', 'gitter.im', 'libera.chat'];
|
||||
|
||||
const NSFW_WORDS = ['nsfw', 'porn', 'nudes', 'sex', '18+'];
|
||||
const NSFW_REGEXES = NSFW_WORDS.map((word) => new RegExp(`\\b${word}\\b`, 'i'));
|
||||
|
||||
class RoomDirectoryViewModel extends ViewModel {
|
||||
constructor(options) {
|
||||
super(options);
|
||||
|
@ -267,9 +265,7 @@ class RoomDirectoryViewModel extends ViewModel {
|
|||
this._roomCardViewModelsFilterMap.setApply((roomId, vm) => {
|
||||
// We concat the name, topic, etc together to simply do a single check against
|
||||
// all of the text.
|
||||
const isNsfw = NSFW_REGEXES.some((regex) =>
|
||||
regex.test(vm.name + ' ---- ' + vm.canonicalAlias + ' --- ' + vm.topic)
|
||||
);
|
||||
const isNsfw = checkTextForNsfw(vm.name + ' --- ' + vm.canonicalAlias + ' --- ' + vm.topic);
|
||||
vm.setBlockedBySafeSearch(isNsfw);
|
||||
});
|
||||
} else {
|
||||
|
|
|
@ -633,6 +633,58 @@ describe('matrix-public-archive', () => {
|
|||
}
|
||||
});
|
||||
|
||||
describe('safe search', () => {
|
||||
[
|
||||
{
|
||||
testName: 'nsfw words in title',
|
||||
createRoomOptions: {
|
||||
name: `uranus-nsfw`,
|
||||
},
|
||||
},
|
||||
{
|
||||
testName: 'nsfw words in topic',
|
||||
createRoomOptions: {
|
||||
name: `mars`,
|
||||
topic: 'Get your ass to mars (NSFW)',
|
||||
},
|
||||
},
|
||||
].forEach((testCase) => {
|
||||
it(`${testCase.testName} is correctly blocked/marked by safe search`, async () => {
|
||||
const client = await getTestClientForHs(testMatrixServerUrl1);
|
||||
const roomId = await createTestRoom(client, testCase.createRoomOptions);
|
||||
|
||||
archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForDate(roomId, archiveDate);
|
||||
const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl);
|
||||
const dom = parseHTML(archivePageHtml);
|
||||
|
||||
// Make sure the `<meta name="rating" ...>` tag exists on the page
|
||||
// telling search engines that this is an adult page.
|
||||
const metaElements = Array.from(dom.document.querySelectorAll('meta'));
|
||||
assert.strictEqual(
|
||||
dom.document.querySelector(`meta[name="rating"]`)?.getAttribute('content'),
|
||||
'adult',
|
||||
`Unable to find <meta name="rating" content="adult"> on the page. We found these meta elements though:${metaElements
|
||||
// eslint-disable-next-line max-nested-callbacks
|
||||
.map((metaElement) => `\n \`${metaElement.outerHTML}\``)
|
||||
.join('')}`
|
||||
);
|
||||
});
|
||||
});
|
||||
|
||||
it('normal room is not blocked/marked by safe search', async () => {
|
||||
const client = await getTestClientForHs(testMatrixServerUrl1);
|
||||
const roomId = await createTestRoom(client);
|
||||
|
||||
archiveUrl = matrixPublicArchiveURLCreator.archiveUrlForDate(roomId, archiveDate);
|
||||
const { data: archivePageHtml } = await fetchEndpointAsText(archiveUrl);
|
||||
const dom = parseHTML(archivePageHtml);
|
||||
|
||||
// Make sure the `<meta name="rating" ...>` tag does NOT exist on the
|
||||
// page telling search engines that this is an adult page.
|
||||
assert.strictEqual(dom.document.querySelector(`meta[name="rating"]`), null);
|
||||
});
|
||||
});
|
||||
|
||||
describe('time selector', () => {
|
||||
it('shows time selector when there are too many messages from the same day', async () => {
|
||||
// Set this low so it's easy to hit the limit
|
||||
|
|
Loading…
Reference in New Issue