OpenTelemetry tracing so we can see spans where the app is taking time (#27)

OpenTelemetry tracing so we can see spans where the app is taking time.
For the user, we specifically show the spans for the external API HTTP requests
that are slow (so we know when the Matrix API is being slow).

Enable tracing:

 - `npm run start -- --tracing`
 - `npm run start-dev -- --tracing`

What does this PR change:

 - Adds OpenTelemetry tracing with some of the automatic instrumentation (includes HTTP and express)
    - We ignore traces for serving static assets (just noise)
 - Adds `X-Trace-Id` to the response headers
 - Adds `window.tracingSpansForRequest` which includes the external HTTP API requests made during the request
 - Adds a fancy 504 timeout page that includes trace details and lists the slow HTTP requests
 - Adds `jaegerTracesEndpoint` configuration to export tracing spans to Jaeger
 - Related to, https://github.com/matrix-org/matrix-public-archive/issues/26
This commit is contained in:
Eric Eastwood 2022-07-14 11:08:50 -05:00 committed by GitHub
parent 13eb92b067
commit ddfe94beab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 5150 additions and 1917 deletions

View File

@ -3,6 +3,8 @@
"basePath": "http://localhost:3050",
"matrixServerUrl": "http://localhost:8008/",
"archiveMessageLimit": 500,
"requestTimeoutMs": 25000,
"UNCOMMENT_jaegerTracesEndpoint": "http://localhost:14268/api/traces",
"testMatrixServerUrl1": "http://localhost:11008/",
"testMatrixServerUrl2": "http://localhost:12008/",

6468
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -30,6 +30,17 @@
"vite": "^2.9.6"
},
"dependencies": {
"@opentelemetry/api": "^1.1.0",
"@opentelemetry/auto-instrumentations-node": "^0.31.0",
"@opentelemetry/context-async-hooks": "^1.4.0",
"@opentelemetry/core": "^1.4.0",
"@opentelemetry/exporter-jaeger": "^1.3.1",
"@opentelemetry/instrumentation": "^0.30.0",
"@opentelemetry/propagator-ot-trace": "^0.26.0",
"@opentelemetry/resources": "^1.3.1",
"@opentelemetry/sdk-trace-base": "^1.3.1",
"@opentelemetry/semantic-conventions": "^1.3.1",
"dompurify": "^2.3.9",
"express": "^4.17.2",
"hydrogen-view-sdk": "npm:@mlm/hydrogen-view-sdk@^0.0.13-scratch",
"linkedom": "^0.14.1",

View File

@ -153,3 +153,26 @@ body {
.CalendarView_dayLink_disabled {
opacity: 0.5;
}
/* Error pages */
.heading-sub-detail {
font-weight: normal;
}
.tracing-span-list {
}
.tracing-span-list-item {
margin-bottom: 8px;
}
.tracing-span-item-http-details {
}
.tracing-span-item-sub-details {
margin-left: 4ch;
}

57
server/README.md Normal file
View File

@ -0,0 +1,57 @@
## Tracing
Run the app with the OpenTelemetry tracing.
```
npm run start -- --tracing
# or
npm run start-dev -- --tracing
```
Manually:
```
node --require './server/tracing.js' server/server.js
```
---
Traces are made up of many spans. Each span defines a `traceId` which it is associated with.
## Viewing traces in Jaeger
- Dashboard where users can see visualisations -> http://localhost:16686
via https://www.jaegertracing.io/docs/1.35/getting-started/
```
docker run -d --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-e COLLECTOR_OTLP_ENABLED=true \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.35
```
| Port | Protocol | Component | Function |
| ----- | -------- | --------- | -------------------------------------------------------------------------------------------- |
| 6831 | UDP | agent | accept jaeger.thrift over Thrift-compact protocol (used by most SDKs) |
| 6832 | UDP | agent | accept jaeger.thrift over Thrift-binary protocol (used by Node.js SDK) |
| 5775 | UDP | agent | (deprecated) accept zipkin.thrift over compact Thrift protocol (used by legacy clients only) |
| 5778 | HTTP | agent | serve configs (sampling, etc.) |
| 16686 | HTTP | query | serve frontend |
| 4317 | HTTP | collector | accept OpenTelemetry Protocol (OTLP) over gRPC, if enabled |
| 4318 | HTTP | collector | accept OpenTelemetry Protocol (OTLP) over HTTP, if enabled |
| 14268 | HTTP | collector | accept jaeger.thrift directly from clients |
| 14250 | HTTP | collector | accept model.proto |
| 9411 | HTTP | collector | Zipkin compatible endpoint (optional) |
With Service Performance Monitoring (SPM)

10
server/lib/safe-json.js Normal file
View File

@ -0,0 +1,10 @@
'use strict';
// via https://gitlab.com/gitterHQ/webapp/-/blob/615c78d0b0a314c2c9e4098b8d2ba0471d16961b/modules/templates/lib/safe-json.js
function safeJson(string) {
if (!string) return string;
// From http://benalpert.com/2012/08/03/preventing-xss-json.html
return string.replace(/<\//g, '<\\/');
}
module.exports = safeJson;

View File

@ -0,0 +1,21 @@
'use strict';
const createDOMPurify = require('dompurify');
const { parseHTML } = require('linkedom');
const dom = parseHTML(`
<!doctype html>
<html>
<head></head>
<body></body>
</html>
`);
const DOMPurify = createDOMPurify(dom.window);
function sanitizeHtml(dirtyHtml) {
const cleanHtml = DOMPurify.sanitize(dirtyHtml);
return cleanHtml;
}
module.exports = sanitizeHtml;

View File

@ -7,9 +7,14 @@ const express = require('express');
const asyncHandler = require('../lib/express-async-handler');
const StatusError = require('../lib/status-error');
const { handleTracingMiddleware, getSerializableSpans } = require('../tracing/tracing-middleware');
const timeoutMiddleware = require('./timeout-middleware');
const fetchRoomData = require('../fetch-room-data');
const fetchEventsInRange = require('../fetch-events-in-range');
const renderHydrogenToString = require('../hydrogen-render/1-render-hydrogen-to-string');
const sanitizeHtml = require('../lib/sanitize-html');
const safeJson = require('../lib/safe-json');
const config = require('../lib/config');
const basePath = config.get('basePath');
@ -63,9 +68,14 @@ function parseArchiveRangeFromReq(req) {
}
function installRoutes(app) {
app.get('/health-check', async function (req, res) {
res.send('{ "ok": true }');
});
app.use(handleTracingMiddleware);
app.get(
'/health-check',
asyncHandler(async function (req, res) {
res.send('{ "ok": true }');
})
);
// We have to disable no-missing-require lint because it doesn't take into
// account `package.json`. `exports`, see
@ -73,25 +83,34 @@ function installRoutes(app) {
// eslint-disable-next-line node/no-missing-require
app.use(express.static(path.dirname(require.resolve('hydrogen-view-sdk/assets/main.js'))));
app.get('/hydrogen-styles.css', async function (req, res) {
res.set('Content-Type', 'text/css');
// We have to disable no-missing-require lint because it doesn't take into
// account `package.json`. `exports`, see
// https://github.com/mysticatea/eslint-plugin-node/issues/255
// eslint-disable-next-line node/no-missing-require
res.sendFile(require.resolve('hydrogen-view-sdk/assets/theme-element-light.css'));
});
app.get(
'/hydrogen-styles.css',
asyncHandler(async function (req, res) {
res.set('Content-Type', 'text/css');
// We have to disable no-missing-require lint because it doesn't take into
// account `package.json`. `exports`, see
// https://github.com/mysticatea/eslint-plugin-node/issues/255
// eslint-disable-next-line node/no-missing-require
res.sendFile(require.resolve('hydrogen-view-sdk/assets/theme-element-light.css'));
})
);
// Our own archive app styles
app.get('/styles.css', async function (req, res) {
res.set('Content-Type', 'text/css');
res.sendFile(path.join(__dirname, '../../public/styles/styles.css'));
});
app.get(
'/styles.css',
asyncHandler(async function (req, res) {
res.set('Content-Type', 'text/css');
res.sendFile(path.join(__dirname, '../../public/styles/styles.css'));
})
);
app.get('/matrix-public-archive.js', async function (req, res) {
res.set('Content-Type', 'text/css');
res.sendFile(path.join(__dirname, '../../dist/matrix-public-archive.es.js'));
});
app.get(
'/matrix-public-archive.js',
asyncHandler(async function (req, res) {
res.set('Content-Type', 'text/css');
res.sendFile(path.join(__dirname, '../../dist/matrix-public-archive.es.js'));
})
);
app.get(
'/:roomIdOrAlias/event/:eventId',
@ -106,6 +125,7 @@ function installRoutes(app) {
// https://gitlab.com/gitterHQ/webapp/-/blob/14954e05c905e8c7cb675efebb89116c07cfaab5/server/handlers/app/archive.js#L190-297
app.get(
'/:roomIdOrAlias/date/:yyyy(\\d{4})/:mm(\\d{2})/:dd(\\d{2})/:hourRange(\\d\\d?-\\d\\d?)?',
timeoutMiddleware,
asyncHandler(async function (req, res) {
const roomIdOrAlias = req.params.roomIdOrAlias;
assert(roomIdOrAlias.startsWith('!') || roomIdOrAlias.startsWith('#'));
@ -157,6 +177,9 @@ function installRoutes(app) {
stateEventMap,
});
const serializableSpans = getSerializableSpans();
const serializedSpans = JSON.stringify(serializableSpans);
const hydrogenStylesUrl = urlJoin(basePath, 'hydrogen-styles.css');
const stylesUrl = urlJoin(basePath, 'styles.css');
const jsBundleUrl = urlJoin(basePath, 'matrix-public-archive.js');
@ -164,15 +187,19 @@ function installRoutes(app) {
<!doctype html>
<html lang="en">
<head>
<link href="${hydrogenStylesUrl}" rel="stylesheet">
<link href="${stylesUrl}" rel="stylesheet">
${sanitizeHtml(`<title>${roomData.name} - Matrix Public Archive</title>`)}
<link href="${hydrogenStylesUrl}" rel="stylesheet">
<link href="${stylesUrl}" rel="stylesheet">
</head>
<body>
${hydrogenHtmlOutput}
<script type="text/javascript" src="${jsBundleUrl}"></script>
<script type="text/javascript">window.tracingSpansForRequest = ${safeJson(
serializedSpans
)};</script>
</body>
</html>
`;
`;
res.set('Content-Type', 'text/html');
res.send(pageHtml);

View File

@ -0,0 +1,98 @@
'use strict';
const assert = require('assert');
const urlJoin = require('url-join');
const asyncHandler = require('../lib/express-async-handler');
const { getSerializableSpans, getActiveTraceId } = require('../tracing/tracing-middleware');
const sanitizeHtml = require('../lib/sanitize-html');
const safeJson = require('../lib/safe-json');
const config = require('../lib/config');
const basePath = config.get('basePath');
assert(basePath);
const requestTimeoutMs = config.get('requestTimeoutMs');
assert(requestTimeoutMs);
// Based off of the `connect-timeout` middleware,
// https://github.com/expressjs/timeout/blob/f2f520f335f2f2ae255d4778e908e8d38e3a4e68/index.js
async function timeoutMiddleware(req, res, next) {
const timeoutId = setTimeout(() => {
const traceId = getActiveTraceId();
const serializableSpans = getSerializableSpans();
const serializedSpans = JSON.stringify(serializableSpans);
let humanReadableSpans;
if (serializableSpans.length > 0) {
humanReadableSpans = serializableSpans.map((serializableSpan) => {
const method = serializableSpan.attributes['http.method'];
const url = serializableSpan.attributes['http.url'];
const statusCode = serializableSpan.attributes['http.status_code'];
let durationString = `request is still running (${
Date.now() - serializableSpan.startTimeInMs
}ms so far)`;
if (serializableSpan.durationInMs) {
durationString = `took ${serializableSpan.durationInMs}ms`;
}
return `<li class="tracing-span-list-item">
<div class="tracing-span-item-http-details">${statusCode ?? '🏃'}: ${method} ${url}</div>
<div class="tracing-span-item-sub-details">${durationString}</div>
</li>`;
});
} else {
const noTracingDataAvailableItem = `<li class="tracing-span-list-item">
<div class="tracing-span-item-http-details">No tracing data available</div>
</li>`;
humanReadableSpans = [noTracingDataAvailableItem];
}
const hydrogenStylesUrl = urlJoin(basePath, 'hydrogen-styles.css');
const stylesUrl = urlJoin(basePath, 'styles.css');
const pageHtml = `
<!doctype html>
<html lang="en">
<head>
<title>Server timeout - Matrix Public Archive</title>
<link href="${hydrogenStylesUrl}" rel="stylesheet">
<link href="${stylesUrl}" rel="stylesheet">
</head>
${/* We add the .hydrogen class here just to get normal body styles */ ''}
<body class="hydrogen">
<h1>504: Server timeout</h1>
<p>Server was unable to respond in time (${requestTimeoutMs / 1000}s)</p>
<h3>These are the external API requests that made it slow:</h3>
${sanitizeHtml(`<ul class="tracing-span-list">
${humanReadableSpans.join('\n')}
</ul>`)}
${sanitizeHtml(
`<h2>Trace ID: <span class="heading-sub-detail">${
traceId ?? `none (tracing is probably not enabled)`
}</span></h2>`
)}
<script type="text/javascript">window.tracingSpansForRequest = ${safeJson(
serializedSpans
)};</script>
</body>
</html>
`;
// 504 Gateway timeout
res.status(504);
res.set('Content-Type', 'text/html');
res.send(pageHtml);
}, requestTimeoutMs);
res.on('finish', function () {
clearTimeout(timeoutId);
});
next();
}
module.exports = asyncHandler(timeoutMiddleware);

View File

@ -2,6 +2,12 @@
console.log('server process.env.NODE_ENV', process.env.NODE_ENV);
if (process.argv.includes('--tracing')) {
console.log('Tracing is active 🕵️');
const { startTracing } = require('./tracing/tracing.js');
startTracing();
}
const express = require('express');
const installRoutes = require('./routes/install-routes');

View File

@ -19,6 +19,11 @@ build(
})
);
const args = [];
if (process.argv.includes('--tracing')) {
args.push('--tracing');
}
// Listen for any changes to files and restart the Node.js server process
//
// For API docs, see
@ -28,6 +33,7 @@ nodemon({
ext: 'js json',
ignoreRoot: ['.git'],
ignore: [path.join(__dirname, '../dist/*')],
args,
});
nodemon

View File

@ -0,0 +1,71 @@
'use strict';
//const { SpanProcessor } = require('@opentelemetry/sdk-trace-base');
const { suppressTracing } = require('@opentelemetry/core');
const { context } = require('@opentelemetry/api');
// 1. Keeps track of all spans for a given trace after calling
// `trackSpansInTrace(traceId)` (call this in a middleware before any other
// routes).
// 2. Then during the request, you can see all spans for a given trace with
// `getSpansInTrace(traceId)`.
// 3. Don't forget to clean up with `dropSpansInTrace(traceId)` after you're
// done with the spans (should be done in the `res.on('finish', ...)`
// callback).
//
// NoopSpanProcessor.ts reference:
// https://github.com/open-telemetry/opentelemetry-js/blob/747c404fb1295d54eb628a434ebf782d2f180bfb/packages/opentelemetry-sdk-trace-base/src/export/NoopSpanProcessor.ts
class CaptureSpanProcessor {
// Map from traceId to spans in the trace
traceMap = {};
// We capture when the span starts so that we get any ongoing spans if the
// request times out and we want to show what it was stuck on.
onStart(span /*, ctx*/) {
// prevent downstream exporter calls from generating spans
context.with(suppressTracing(context.active()), () => {
const traceIdsToTrack = Object.keys(this.traceMap);
const traceId = span.spanContext().traceId;
if (traceIdsToTrack.includes(traceId)) {
this.traceMap[traceId].push(span);
}
});
}
onEnd(/*span*/) {
/* noop */
}
shutdown() {
/* noop */
return Promise.resolve();
}
forceFlush() {
/* noop */
return Promise.resolve();
}
// Get all spans for a given trace.
getSpansInTrace(traceId) {
return this.traceMap[traceId];
}
// Keeps track of all spans for a given trace after calling
// `trackSpansInTrace(traceId)` (call this in a middleware before any other
// routes).
trackSpansInTrace(traceId) {
this.traceMap[traceId] = [];
}
// Don't forget to clean up with `dropSpansInTrace(traceId)` after you're done
// with the spans (should be done in the `res.on('finish', ...)` callback).
//
// alias: Dispose
dropSpansInTrace(traceId) {
delete this.traceMap[traceId];
}
}
module.exports = CaptureSpanProcessor;

View File

@ -0,0 +1,44 @@
'use strict';
const {
hrTimeToMilliseconds,
//hrTimeToMicroseconds
} = require('@opentelemetry/core');
const SAFE_ATTRIBUTES = ['http.method', 'http.url', 'http.status_code', 'http.target'];
// Convert a `Span` object to a plain old JavaScript object with only the info
// we care about and that is safe to share. We want something we can JSON
// serialize.
function serializeSpan(span) {
const spanContext = span.spanContext();
const safeAttributes = {};
SAFE_ATTRIBUTES.forEach((safeAttribute) => {
safeAttributes[safeAttribute] = span.attributes[safeAttribute];
});
const startTimeInMs = hrTimeToMilliseconds(span.startTime);
const endTimeInMs = hrTimeToMilliseconds(span.endTime);
let durationInMs = null;
if (startTimeInMs && endTimeInMs) {
durationInMs = endTimeInMs - startTimeInMs;
}
return {
name: span.name,
spanContext: {
traceId: spanContext.traceId,
spanId: spanContext.spanId,
},
//parentSpanId: span.parentSpanId,
startTimeInMs,
endTimeInMs,
durationInMs,
attributes: safeAttributes,
//span.links
};
}
module.exports = serializeSpan;

View File

@ -0,0 +1,72 @@
'use strict';
const opentelemetryApi = require('@opentelemetry/api');
const asyncHandler = require('../lib/express-async-handler');
const { captureSpanProcessor } = require('./tracing');
const serializeSpan = require('./serialize-span');
// From the current active context, grab the `traceId`. The `traceId` will be
// shared for the whole request because all spans live under the root span.
function getActiveTraceId() {
//const rootCtx = opentelemetryApi.ROOT_CONTEXT;
const activeCtx = opentelemetryApi.context.active();
if (activeCtx) {
const span = opentelemetryApi.trace.getSpan(activeCtx);
if (span) {
const traceId = span.spanContext().traceId;
return traceId;
}
}
}
// Handles keeping track of tracing spans for each request.
// Also adds the traceId to the to the `X-Trace-Id` response header.
async function handleTracingMiddleware(req, res, next) {
const traceId = getActiveTraceId();
if (traceId) {
// Add the OpenTelemetry trace ID to the `X-Trace-Id` response header so
// we can cross-reference. We can use this to lookup the request in
// Jaeger.
res.set('X-Trace-Id', traceId);
// Start keeping track of all of spans that happen during the request
captureSpanProcessor.trackSpansInTrace(traceId);
// Cleanup after the request is done
res.on('finish', function () {
captureSpanProcessor.dropSpansInTrace(traceId);
});
}
next();
}
// Get all of spans we're willing to show to the user.
//
// We only care about showing the external API HTTP requests to the user so they
// can tell what part of the Matrix API is being so slow.
function getSerializableSpans() {
const traceId = getActiveTraceId();
if (traceId) {
const spans = captureSpanProcessor.getSpansInTrace(traceId) ?? [];
// We only care about showing the external API HTTP requests to the user
const filteredSpans = spans.filter((span) => {
return span.instrumentationLibrary.name === '@opentelemetry/instrumentation-http';
});
const serializableSpans = filteredSpans.map((span) => serializeSpan(span));
return serializableSpans;
}
return [];
}
module.exports = {
handleTracingMiddleware: asyncHandler(handleTracingMiddleware),
getSerializableSpans,
getActiveTraceId,
};

107
server/tracing/tracing.js Normal file
View File

@ -0,0 +1,107 @@
'use strict';
const assert = require('assert');
const { registerInstrumentations } = require('@opentelemetry/instrumentation');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { diag, DiagConsoleLogger, DiagLogLevel } = require('@opentelemetry/api');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const {
BasicTracerProvider,
// ConsoleSpanExporter,
// SimpleSpanProcessor,
BatchSpanProcessor,
} = require('@opentelemetry/sdk-trace-base');
const { AsyncLocalStorageContextManager } = require('@opentelemetry/context-async-hooks');
const { OTTracePropagator } = require('@opentelemetry/propagator-ot-trace');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');
const CaptureSpanProcessor = require('./capture-span-processor');
const config = require('../lib/config');
const jaegerTracesEndpoint = config.get('jaegerTracesEndpoint');
const packageInfo = require('../../package.json');
assert(packageInfo.name);
// (Diagnostics) For troubleshooting, set the log level to DiagLogLevel.DEBUG.
//
diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.INFO);
const provider = new BasicTracerProvider({
resource: new Resource({
[SemanticResourceAttributes.SERVICE_NAME]: packageInfo.name,
}),
});
// Export spans to console (useful for debugging).
// provider.addSpanProcessor(new SimpleSpanProcessor(new ConsoleSpanExporter()));
// Export spans to Jaeger
let isExportingToJaeger = false;
if (jaegerTracesEndpoint) {
const exporter = new JaegerExporter({
tags: [],
endpoint: jaegerTracesEndpoint,
});
// Use this for immediate span submission
//provider.addSpanProcessor(new SimpleSpanProcessor(exporter));
// Otherwise, we should just use the more performant batched processor
provider.addSpanProcessor(new BatchSpanProcessor(exporter));
isExportingToJaeger = true;
}
const captureSpanProcessor = new CaptureSpanProcessor();
provider.addSpanProcessor(captureSpanProcessor);
function startTracing() {
if (!isExportingToJaeger) {
console.warn(
`⚠ Tracing was started but \`jaegerTracesEndpoint\` was not configured so we are not exporting anything.`
);
}
provider.register({
contextManager: new AsyncLocalStorageContextManager(),
propagator: new OTTracePropagator(),
});
registerInstrumentations({
instrumentations: [
getNodeAutoInstrumentations({
'@opentelemetry/instrumentation-http': {
// Docs:
// https://github.com/open-telemetry/opentelemetry-js/tree/51afd54bd63e46d5d530266761144c7be2f6b3a7/experimental/packages/opentelemetry-instrumentation-http#http-instrumentation-options
//
// This is the place to ignore root level spans for Express routes. My
// first guess would be in `ignoreLayers` in
// `@opentelemetry/instrumentation-express` but that's not the case, see
// https://github.com/open-telemetry/opentelemetry-js-contrib/issues/1034#issuecomment-1158435392
ignoreIncomingRequestHook: (req) => {
// Ignore spans from static assets.
//
// FIXME: Ideally, all of the assets would all be served under
// `/static/` so we could ignore that way instead. In Hydrogen, this
// is tracked by https://github.com/vector-im/hydrogen-web/issues/757
const isStaticAsset = !!req.url.match(/\.(css|js|svg|woff2)(\?.*?)?$/);
return isStaticAsset;
},
},
}),
],
});
process.on('SIGTERM', () => {
provider
.shutdown()
.then(() => console.log('Tracing terminated'))
.catch((error) => console.log('Error terminating tracing', error))
.finally(() => process.exit(0));
});
}
module.exports = {
startTracing,
provider,
captureSpanProcessor,
};