oai-reverse-proxy/src/shared/prompt-logging/backends/sheets.ts

423 lines
12 KiB
TypeScript

/* Google Sheets backend for prompt logger. Upon every flush, this backend
writes the batch to a Sheets spreadsheet. If the sheet becomes too large, it
will create a new sheet and continue writing there.
This is essentially a really shitty ORM for Sheets. Absolutely no concurrency
support because it relies on local state to match up with the remote state. */
import { google, sheets_v4 } from "googleapis";
import type { CredentialBody } from "google-auth-library";
import type { GaxiosResponse } from "googleapis-common";
import { config } from "../../../config";
import { logger } from "../../../logger";
import { PromptLogEntry } from "..";
// There is always a sheet called __index__ which contains a list of all the
// other sheets. We use this rather than iterating over all the sheets in case
// the user needs to manually work with the spreadsheet.
// If no __index__ sheet exists, we will assume that the spreadsheet is empty
// and create one.
type IndexSheetModel = {
/**
* Stored in cell B2. Set on startup; if it changes, we assume that another
* instance of the proxy is writing to the spreadsheet and stop.
*/
lockId: string;
/**
* Data starts at row 4. Row 1-3 are headers
*/
rows: { logSheetName: string; createdAt: string; rowCount: number }[];
};
type LogSheetModel = {
sheetName: string;
rows: {
model: string;
endpoint: string;
promptRaw: string;
promptFlattened: string;
response: string;
}[];
};
const MAX_ROWS_PER_SHEET = 2000;
const log = logger.child({ module: "sheets" });
let sheetsClient: sheets_v4.Sheets | null = null;
/** Called when log backend aborts to tell the log queue to stop. */
let stopCallback: (() => void) | null = null;
/** Lock/synchronization ID for this session. */
let lockId = Math.random().toString(36).substring(2, 15);
/** In-memory cache of the index sheet. */
let indexSheet: IndexSheetModel | null = null;
/** In-memory cache of the active log sheet. */
let activeLogSheet: LogSheetModel | null = null;
/**
* Loads the __index__ sheet into memory. By default, asserts that the lock ID
* has not changed since the start of the session.
*/
const loadIndexSheet = async (assertLockId = true) => {
const client = sheetsClient!;
const spreadsheetId = config.googleSheetsSpreadsheetId!;
log.info({ assertLockId }, "Loading __index__ sheet.");
const res = await client.spreadsheets.values.get({
spreadsheetId: spreadsheetId,
range: "__index__!A1:D",
majorDimension: "ROWS",
});
const data = assertData(res);
if (!data.values || data.values[2][0] !== "logSheetName") {
log.error({ values: data.values }, "Unexpected format for __index__ sheet");
throw new Error("Unexpected format for __index__ sheet");
}
if (assertLockId) {
const lockIdCell = data.values[1][1];
if (lockIdCell !== lockId) {
log.error(
{ receivedLock: lockIdCell, expectedLock: lockId },
"Another instance of the proxy is writing to the spreadsheet; stopping."
);
stop();
throw new Error(`Lock ID assertion failed`);
}
}
const rows = data.values.slice(3).map((row) => {
return {
logSheetName: row[0],
createdAt: row[1],
rowCount: row[2],
};
});
indexSheet = { lockId, rows };
};
/** Creates empty __index__ sheet for a new spreadsheet. */
const createIndexSheet = async () => {
const client = sheetsClient!;
const spreadsheetId = config.googleSheetsSpreadsheetId!;
log.info("Creating empty __index__ sheet.");
const res = await client.spreadsheets.batchUpdate({
spreadsheetId: spreadsheetId,
requestBody: {
requests: [
{
addSheet: {
properties: {
title: "__index__",
gridProperties: { rowCount: 1, columnCount: 3 },
},
},
},
],
},
});
assertData(res);
indexSheet = { lockId, rows: [] };
await writeIndexSheet();
};
/** Writes contents of in-memory indexSheet to the remote __index__ sheet. */
const writeIndexSheet = async () => {
const client = sheetsClient!;
const spreadsheetId = config.googleSheetsSpreadsheetId!;
const headerRows = [
["Don't edit this sheet while the server is running.", "", ""],
["Lock ID", lockId, ""],
["logSheetName", "createdAt", "rowCount"],
];
const contentRows = indexSheet!.rows.map((row) => {
return [row.logSheetName, row.createdAt, row.rowCount];
});
log.info("Persisting __index__ sheet.");
await client.spreadsheets.values.batchUpdate({
spreadsheetId: spreadsheetId,
requestBody: {
valueInputOption: "RAW",
data: [
{ range: "__index__!A1:D", values: [...headerRows, ...contentRows] },
],
},
});
};
/** Creates a new log sheet, adds it to the index, and sets it as active. */
const createLogSheet = async () => {
const client = sheetsClient!;
const spreadsheetId = config.googleSheetsSpreadsheetId!;
// Sheet name format is Log_YYYYMMDD_HHMMSS
const sheetName = `Log_${new Date()
.toISOString()
// YYYY-MM-DDTHH:MM:SS.sssZ -> YYYYMMDD_HHMMSS
.replace(/[-:.]/g, "")
.replace(/T/, "_")
.substring(0, 15)}`;
log.info({ sheetName }, "Creating new log sheet.");
const res = await client.spreadsheets.batchUpdate({
spreadsheetId: spreadsheetId,
requestBody: {
requests: [
{
addSheet: {
properties: {
title: sheetName,
gridProperties: { rowCount: MAX_ROWS_PER_SHEET, columnCount: 5 },
},
},
},
],
},
});
assertData(res);
// Increase row/column size and wrap text for readability.
const sheetId = res.data.replies![0].addSheet!.properties!.sheetId;
await client.spreadsheets.batchUpdate({
spreadsheetId: spreadsheetId,
requestBody: {
requests: [
{
repeatCell: {
range: { sheetId },
cell: {
userEnteredFormat: {
wrapStrategy: "WRAP",
verticalAlignment: "TOP",
},
},
fields: "*",
},
},
{
updateDimensionProperties: {
range: {
sheetId,
dimension: "COLUMNS",
startIndex: 3,
endIndex: 5,
},
properties: { pixelSize: 500 },
fields: "pixelSize",
},
},
{
updateDimensionProperties: {
range: {
sheetId,
dimension: "ROWS",
startIndex: 1,
},
properties: { pixelSize: 200 },
fields: "pixelSize",
},
},
],
},
});
await client.spreadsheets.values.batchUpdate({
spreadsheetId: spreadsheetId,
requestBody: {
valueInputOption: "RAW",
data: [
{
range: `${sheetName}!A1:E`,
values: [
["model", "endpoint", "prompt json", "prompt string", "response"],
],
},
],
},
});
indexSheet!.rows.push({
logSheetName: sheetName,
createdAt: new Date().toISOString(),
rowCount: 0,
});
await writeIndexSheet();
activeLogSheet = { sheetName, rows: [] };
};
export const appendBatch = async (batch: PromptLogEntry[]) => {
if (!activeLogSheet) {
// Create a new log sheet if we don't have one yet.
await createLogSheet();
} else {
// Check lock to ensure we're the only instance writing to the spreadsheet.
await loadIndexSheet(true);
}
const client = sheetsClient!;
const spreadsheetId = config.googleSheetsSpreadsheetId!;
const sheetName = activeLogSheet!.sheetName;
const newRows = batch.map((entry) => {
return [
entry.model,
entry.endpoint,
entry.promptRaw.slice(-50000),
entry.promptFlattened.slice(-50000),
entry.response.slice(0, 50000),
];
});
log.info({ sheetName, rowCount: newRows.length }, "Appending log batch.");
const data = await client.spreadsheets.values.append({
spreadsheetId: spreadsheetId,
range: `${sheetName}!A1:D`,
valueInputOption: "RAW",
requestBody: { values: newRows, majorDimension: "ROWS" },
});
assertData(data);
if (data.data.updates && data.data.updates.updatedRows) {
const newRowCount = data.data.updates.updatedRows;
log.info({ sheetName, rowCount: newRowCount }, "Successfully appended.");
activeLogSheet!.rows = activeLogSheet!.rows.concat(
newRows.map((row) => ({
model: row[0],
endpoint: row[1],
promptRaw: row[2],
promptFlattened: row[3],
response: row[4],
}))
);
} else {
// We didn't receive an error but we didn't get any updates either.
// We may need to create a new sheet and throw to make the queue retry the
// batch.
log.warn(
{ sheetName, rowCount: newRows.length },
"No updates received from append. Creating new sheet and retrying."
);
await createLogSheet();
throw new Error("No updates received from append.");
}
await finalizeBatch();
};
const finalizeBatch = async () => {
const sheetName = activeLogSheet!.sheetName;
const rowCount = activeLogSheet!.rows.length;
const indexRow = indexSheet!.rows.find(
({ logSheetName }) => logSheetName === sheetName
)!;
indexRow.rowCount = rowCount;
if (rowCount >= MAX_ROWS_PER_SHEET) {
await createLogSheet(); // Also updates index sheet
} else {
await writeIndexSheet();
}
log.info({ sheetName, rowCount }, "Batch finalized.");
};
type LoadLogSheetArgs = {
sheetName: string;
/** The starting row to load. If omitted, loads all rows (expensive). */
fromRow?: number;
};
/** Not currently used. */
export const loadLogSheet = async ({
sheetName,
fromRow = 2, // omit header row
}: LoadLogSheetArgs) => {
const client = sheetsClient!;
const spreadsheetId = config.googleSheetsSpreadsheetId!;
const range = `${sheetName}!A${fromRow}:E`;
const res = await client.spreadsheets.values.get({
spreadsheetId: spreadsheetId,
range,
});
const data = assertData(res);
const values = data.values || [];
const rows = values.slice(1).map((row) => {
return {
model: row[0],
endpoint: row[1],
promptRaw: row[2],
promptFlattened: row[3],
response: row[4],
};
});
activeLogSheet = { sheetName, rows };
};
export const init = async (onStop: () => void) => {
if (sheetsClient) {
return;
}
if (!config.googleSheetsKey || !config.googleSheetsSpreadsheetId) {
throw new Error(
"Missing required Google Sheets config. Refer to documentation for setup instructions."
);
}
log.info("Initializing Google Sheets backend.");
const encodedCreds = config.googleSheetsKey;
// encodedCreds is a base64-encoded JSON key from the GCP console.
const creds: CredentialBody = JSON.parse(
Buffer.from(encodedCreds, "base64").toString("utf8").trim()
);
const auth = new google.auth.GoogleAuth({
scopes: ["https://www.googleapis.com/auth/spreadsheets"],
credentials: creds,
});
sheetsClient = google.sheets({ version: "v4", auth });
stopCallback = onStop;
const sheetId = config.googleSheetsSpreadsheetId;
const res = await sheetsClient.spreadsheets.get({
spreadsheetId: sheetId,
});
if (!res.data) {
const { status, statusText, headers } = res;
log.error(
{
res: { status, statusText, headers },
creds: {
client_email: creds.client_email?.slice(0, 5) + "********",
private_key: creds.private_key?.slice(0, 5) + "********",
},
sheetId: config.googleSheetsSpreadsheetId,
},
"Could not connect to Google Sheets."
);
stop();
throw new Error("Could not connect to Google Sheets.");
} else {
const sheetTitle = res.data.properties?.title;
log.info({ sheetId, sheetTitle }, "Connected to Google Sheets.");
}
// Load or create the index sheet and write the lockId to it.
try {
log.info("Loading index sheet.");
await loadIndexSheet(false);
await writeIndexSheet();
} catch (e) {
log.warn(e, "Could not load index sheet. Creating a new one.");
await createIndexSheet();
}
};
/** Called during some unrecoverable error to tell the log queue to stop. */
function stop() {
log.warn("Stopping Google Sheets backend.");
if (stopCallback) {
stopCallback();
}
sheetsClient = null;
}
function assertData<T = sheets_v4.Schema$ValueRange>(res: GaxiosResponse<T>) {
if (!res.data) {
const { status, statusText, headers } = res;
log.error(
{ res: { status, statusText, headers } },
"Unexpected response from Google Sheets API."
);
}
return res.data!;
}