Implement priority queue by tokens (khanon/oai-reverse-proxy!79)

honeytree 2024-09-09 16:48:46 +00:00 committed by khanon
parent ac1897fd17
commit bd87ca60f7
4 changed files with 70 additions and 1 deletion

@@ -176,3 +176,10 @@ GCP_CREDENTIALS=project-id:client-email:region:private-key
# With prompt logging, the Google Sheets credentials.
# GOOGLE_SHEETS_SPREADSHEET_ID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# GOOGLE_SHEETS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Prioritize requests in the queue according to their
# token count, placing larger requests further back.
#
# A value of 1.0 delays a request by one second per 1k
# tokens, 2.0 by two seconds per 1k tokens, and so on.
# TOKENS_PUNISHMENT_FACTOR=0.0
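
In effect, the factor converts a request's token count into milliseconds of artificial queue age. A minimal sketch of the resulting scheduling key (effectiveStartTime is a hypothetical helper for illustration; the actual comparator lives in the queue change further down):

// Effective arrival time: real arrival plus factor milliseconds per token.
// With a factor of 2.0, a request totaling 1,500 prompt-plus-output tokens is
// queued as if it had arrived 2.0 * 1500 = 3,000 ms later than it really did.
function effectiveStartTime(startTimeMs: number, tokens: number, factor: number): number {
  return startTimeMs + factor * tokens;
}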

scripts/test-queue.js (new file, 53 lines)
@@ -0,0 +1,53 @@
const axios = require("axios");

// Returns a random integer in [1, max].
function randomInteger(max) {
  return Math.floor(Math.random() * max + 1);
}

async function testQueue() {
  // Fire ten concurrent requests with randomized token budgets and randomized
  // X-Forwarded-For addresses, so each request appears to come from a
  // different client.
  const requests = Array(10).fill(undefined).map(async function () {
    const maxTokens = randomInteger(2000);
    const headers = {
      "Authorization": "Bearer test",
      "Content-Type": "application/json",
      "X-Forwarded-For": `${randomInteger(255)}.${randomInteger(255)}.${randomInteger(255)}.${randomInteger(255)}`,
    };
    const payload = {
      model: "gpt-4o-mini-2024-07-18",
      max_tokens: 20 + maxTokens,
      stream: false,
      messages: [{ role: "user", content: "You are being benchmarked regarding your reliability at outputting exact, machine-comprehensible data. Output the sentence \"The quick brown fox jumps over the lazy dog.\" Do not precede it with quotemarks or any form of preamble, and do not output anything after the sentence." }],
      temperature: 0,
    };
    try {
      const response = await axios.post(
        "http://localhost:7860/proxy/openai/v1/chat/completions",
        payload,
        { headers }
      );
      if (response.status !== 200) {
        console.error(`Request ${maxTokens} finished with status code ${response.status} and response`, response.data);
        return;
      }
      const content = response.data.choices[0].message.content;
      console.log(
        `Request ${maxTokens} `,
        content === "The quick brown fox jumps over the lazy dog." ? "OK" : `mangled: ${content}`
      );
    } catch (error) {
      const msg = error.response;
      console.error(`Error in req ${maxTokens}:`, error.message, msg || "");
    }
  });
  await Promise.all(requests);
  console.log("All requests finished");
}

testQueue();
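
To try it: with the proxy running locally (the script assumes it listens on port 7860 and accepts the hard-coded "Bearer test" credential), run node scripts/test-queue.js. With a positive TOKENS_PUNISHMENT_FACTOR, requests that drew larger max_tokens values should tend to be dequeued, and therefore finish, later than smaller ones enqueued at the same time.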

@@ -378,6 +378,13 @@ type Config = {
   * Takes precedence over the adminWhitelist.
   */
  ipBlacklist: string[];
  /**
   * If set, pushes requests further back in the queue in proportion to their
   * token cost: each request is delayed by factor * tokens milliseconds
   * (more intuitively, factor seconds per thousand tokens).
   * Accepts floats.
   */
  tokensPunishmentFactor: number;
};
// To change configs, create a file called .env in the root directory.
@@ -483,6 +490,7 @@ export const config: Config = {
    getEnvWithDefault("ADMIN_WHITELIST", "0.0.0.0/0,::/0")
  ),
  ipBlacklist: parseCsv(getEnvWithDefault("IP_BLACKLIST", "")),
  tokensPunishmentFactor: getEnvWithDefault("TOKENS_PUNISHMENT_FACTOR", 0.0),
} as const;
function generateSigningKey() {
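
Note that the default of 0.0 makes the punishment term vanish, so the dequeue comparator below reduces to plain startTime ordering and deployments that never set TOKENS_PUNISHMENT_FACTOR keep strict first-come-first-served behavior.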

@@ -13,6 +13,7 @@
import crypto from "crypto";
import { Handler, Request } from "express";
import { config } from "../config";
import { BadRequestError, TooManyRequestsError } from "../shared/errors";
import { keyPool } from "../shared/key-management";
import {
@@ -140,7 +141,7 @@ export function dequeue(partition: ModelFamily): Request | undefined {
  }
  const req = modelQueue.reduce((prev, curr) =>
    prev.startTime + config.tokensPunishmentFactor * ((prev.promptTokens ?? 0) + (prev.outputTokens ?? 0)) <
    curr.startTime + config.tokensPunishmentFactor * ((curr.promptTokens ?? 0) + (curr.outputTokens ?? 0))
      ? prev
      : curr
  );
  queue.splice(queue.indexOf(req), 1);
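
To make the new comparator concrete, here is a small standalone trace of the effective-arrival-time math (a sketch with made-up numbers; QueuedRequest is a pared-down stand-in for the proxy's Request objects, not part of this commit):

// Pared-down stand-in for the queued Request shape used above (illustrative).
type QueuedRequest = { startTime: number; promptTokens?: number; outputTokens?: number };

const tokensPunishmentFactor = 1.0; // as if TOKENS_PUNISHMENT_FACTOR=1.0

// Effective priority: arrival time plus factor milliseconds per token.
const priority = (r: QueuedRequest) =>
  r.startTime + tokensPunishmentFactor * ((r.promptTokens ?? 0) + (r.outputTokens ?? 0));

// A arrived first but is large; B arrived 1,500 ms later but is small.
const a: QueuedRequest = { startTime: 0, promptTokens: 3000, outputTokens: 500 };
const b: QueuedRequest = { startTime: 1500, promptTokens: 200, outputTokens: 100 };

console.log(priority(a)); // 3500
console.log(priority(b)); // 1800 -> B wins the reduce and is dequeued first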