removes QUEUE_MODE config (now always enabled)
parent 5d3fb6af3a
commit 6bb67281d9

@@ -11,7 +11,6 @@
 # REJECT_MESSAGE="This content violates /aicg/'s acceptable use policy."
 # CHECK_KEYS=true
 # QUOTA_DISPLAY_MODE=full
-# QUEUE_MODE=fair
 # BLOCKED_ORIGINS=reddit.com,9gag.com
 # BLOCK_MESSAGE="You must be over the age of majority in your country to use this service."
 # BLOCK_REDIRECT="https://roblox.com/"

@@ -9,7 +9,6 @@ const startupLogger = pino({ level: "debug" }).child({ module: "startup" });
 const isDev = process.env.NODE_ENV !== "production";
 
 type PromptLoggingBackend = "google_sheets";
-export type DequeueMode = "fair" | "random" | "none";
 
 type Config = {
   /** The port the proxy server will listen on. */

@@ -107,16 +106,6 @@ type Config = {
    * `full`: Displays information about keys' quota limits
    */
   quotaDisplayMode: "none" | "full";
-  /**
-   * Which request queueing strategy to use when keys are over their rate limit.
-   *
-   * `fair`: Requests are serviced in the order they were received (default)
-   *
-   * `random`: Requests are serviced randomly
-   *
-   * `none`: Requests are not queued and users have to retry manually
-   */
-  queueMode: DequeueMode;
   /**
    * Comma-separated list of origins to block. Requests matching any of these
    * origins or referers will be rejected.

@@ -179,7 +168,6 @@ export const config: Config = {
     "GOOGLE_SHEETS_SPREADSHEET_ID",
     undefined
   ),
-  queueMode: getEnvWithDefault("QUEUE_MODE", "fair"),
  blockedOrigins: getEnvWithDefault("BLOCKED_ORIGINS", undefined),
  blockMessage: getEnvWithDefault(
    "BLOCK_MESSAGE",

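Note: `getEnvWithDefault` itself is outside this diff. A minimal sketch of the shape these call sites imply (the generic signature and JSON fallback are assumptions, not the project's actual code):

// Minimal sketch of getEnvWithDefault, inferred from the call sites above.
// Assumed shape for illustration; the project's real helper may differ.
function getEnvWithDefault<T>(name: string, defaultValue: T): T {
  const value = process.env[name];
  if (value === undefined) {
    return defaultValue;
  }
  try {
    // Allows numbers/booleans in env vars; plain strings fall through.
    return JSON.parse(value) as T;
  } catch {
    return value as unknown as T;
  }
}

Under that shape, the deleted `queueMode: getEnvWithDefault("QUEUE_MODE", "fair")` already fell back to `"fair"` when the variable was unset, so operators who never set `QUEUE_MODE` see no behavior change from this commit.
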
@@ -148,17 +148,15 @@ function getOpenAIInfo() {
     };
   }
 
-  if (config.queueMode !== "none") {
-    const turboQueue = getQueueInformation("turbo");
+  const turboQueue = getQueueInformation("turbo");
 
-    info.turbo.proomptersInQueue = turboQueue.proomptersInQueue;
-    info.turbo.estimatedQueueTime = turboQueue.estimatedQueueTime;
+  info.turbo.proomptersInQueue = turboQueue.proomptersInQueue;
+  info.turbo.estimatedQueueTime = turboQueue.estimatedQueueTime;
 
-    if (hasGpt4) {
-      const gpt4Queue = getQueueInformation("gpt-4");
-      info.gpt4.proomptersInQueue = gpt4Queue.proomptersInQueue;
-      info.gpt4.estimatedQueueTime = gpt4Queue.estimatedQueueTime;
-    }
-  }
+  if (hasGpt4) {
+    const gpt4Queue = getQueueInformation("gpt-4");
+    info.gpt4.proomptersInQueue = gpt4Queue.proomptersInQueue;
+    info.gpt4.estimatedQueueTime = gpt4Queue.estimatedQueueTime;
+  }
 
   return info;

@@ -168,11 +166,9 @@ function getAnthropicInfo() {
   const claudeInfo: Partial<ServiceInfo> = {};
   const keys = keyPool.list().filter((k) => k.service === "anthropic");
   claudeInfo.activeKeys = keys.filter((k) => !k.isDisabled).length;
-  if (config.queueMode !== "none") {
-    const queue = getQueueInformation("claude");
-    claudeInfo.proomptersInQueue = queue.proomptersInQueue;
-    claudeInfo.estimatedQueueTime = queue.estimatedQueueTime;
-  }
+  const queue = getQueueInformation("claude");
+  claudeInfo.proomptersInQueue = queue.proomptersInQueue;
+  claudeInfo.estimatedQueueTime = queue.estimatedQueueTime;
   return { claude: claudeInfo };
 }
 

@@ -198,26 +194,24 @@ Logs are anonymous and do not contain IP addresses or timestamps. [You can see t
 **If you are uncomfortable with this, don't send prompts to this proxy!**`;
   }
 
-  if (config.queueMode !== "none") {
-    const waits: string[] = [];
-    infoBody += `\n## Estimated Wait Times\nIf the AI is busy, your prompt will processed when a slot frees up.`;
+  const waits: string[] = [];
+  infoBody += `\n## Estimated Wait Times\nIf the AI is busy, your prompt will processed when a slot frees up.`;
 
-    if (config.openaiKey) {
-      const turboWait = getQueueInformation("turbo").estimatedQueueTime;
-      const gpt4Wait = getQueueInformation("gpt-4").estimatedQueueTime;
-      waits.push(`**Turbo:** ${turboWait}`);
-      if (keyPool.list().some((k) => k.isGpt4) && !config.turboOnly) {
-        waits.push(`**GPT-4:** ${gpt4Wait}`);
-      }
-    }
+  if (config.openaiKey) {
+    const turboWait = getQueueInformation("turbo").estimatedQueueTime;
+    const gpt4Wait = getQueueInformation("gpt-4").estimatedQueueTime;
+    waits.push(`**Turbo:** ${turboWait}`);
+    if (keyPool.list().some((k) => k.isGpt4) && !config.turboOnly) {
+      waits.push(`**GPT-4:** ${gpt4Wait}`);
+    }
+  }
 
-    if (config.anthropicKey) {
-      const claudeWait = getQueueInformation("claude").estimatedQueueTime;
-      waits.push(`**Claude:** ${claudeWait}`);
-    }
-    infoBody += "\n\n" + waits.join(" / ");
-  }
+  if (config.anthropicKey) {
+    const claudeWait = getQueueInformation("claude").estimatedQueueTime;
+    waits.push(`**Claude:** ${claudeWait}`);
+  }
+  infoBody += "\n\n" + waits.join(" / ");
 
   if (customGreeting) {
     infoBody += `\n## Server Greeting\n
 ${customGreeting}`;

@@ -227,9 +221,6 @@ ${customGreeting}`;
 
 /** Returns queue time in seconds, or minutes + seconds if over 60 seconds. */
 function getQueueInformation(partition: QueuePartition) {
-  if (config.queueMode === "none") {
-    return {};
-  }
   const waitMs = getEstimatedWaitTime(partition);
   const waitTime =
     waitMs < 60000

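Note: the hunk cuts off mid-expression, but the doc comment states the intent: plain seconds under a minute, otherwise minutes plus seconds. A self-contained sketch consistent with that comment (the exact output strings are assumptions):

// Formats an estimated wait, per the doc comment above: seconds when under
// 60s, otherwise minutes + seconds. Output strings are assumed.
function formatWaitTime(waitMs: number): string {
  if (waitMs < 60000) {
    return `${Math.round(waitMs / 1000)} seconds`;
  }
  const minutes = Math.floor(waitMs / 60000);
  const seconds = Math.round((waitMs % 60000) / 1000);
  return `${minutes} minutes ${seconds} seconds`;
}
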
@@ -33,12 +33,6 @@ const rewriteRequest = (
   req: Request,
   res: Response
 ) => {
-  if (config.queueMode !== "none") {
-    const msg = `Queueing is enabled on this proxy instance and is incompatible with the KoboldAI endpoint. Use the OpenAI endpoint instead.`;
-    proxyReq.destroy(new Error(msg));
-    return;
-  }
-
   req.body.stream = false;
   const rewriterPipeline = [
     addKey,

@@ -341,11 +341,8 @@ function maybeHandleMissingPreambleError(
       "Request failed due to missing preamble. Key will be marked as such for subsequent requests."
     );
     keyPool.update(req.key!, { requiresPreamble: true });
-    if (config.queueMode !== "none") {
-      reenqueueRequest(req);
-      throw new RetryableError("Claude request re-enqueued to add preamble.");
-    }
-    errorPayload.proxy_note = `This Claude key requires special prompt formatting. Try again; the proxy will reformat your prompt next time.`;
+    reenqueueRequest(req);
+    throw new RetryableError("Claude request re-enqueued to add preamble.");
   } else {
     errorPayload.proxy_note = `Proxy received unrecognized error from Anthropic. Check the specific error for more information.`;
   }

@@ -357,11 +354,8 @@ function handleAnthropicRateLimitError(
 ) {
   if (errorPayload.error?.type === "rate_limit_error") {
     keyPool.markRateLimited(req.key!);
-    if (config.queueMode !== "none") {
-      reenqueueRequest(req);
-      throw new RetryableError("Claude rate-limited request re-enqueued.");
-    }
-    errorPayload.proxy_note = `There are too many in-flight requests for this key. Try again later.`;
+    reenqueueRequest(req);
+    throw new RetryableError("Claude rate-limited request re-enqueued.");
   } else {
     errorPayload.proxy_note = `Unrecognized rate limit error from Anthropic. Key may be over quota.`;
   }

@@ -388,13 +382,11 @@ function handleOpenAIRateLimitError(
   } else if (type === "requests" || type === "tokens") {
     // Per-minute request or token rate limit is exceeded, which we can retry
     keyPool.markRateLimited(req.key!);
-    if (config.queueMode !== "none") {
-      reenqueueRequest(req);
-      // This is confusing, but it will bubble up to the top-level response
-      // handler and cause the request to go back into the request queue.
+    // I'm aware this is confusing -- throwing this class of error will cause
+    // the proxy response handler to return without terminating the request,
+    // so that it can be placed back in the queue.
+    reenqueueRequest(req);
     throw new RetryableError("Rate-limited request re-enqueued.");
-    }
-    errorPayload.proxy_note = `Assigned key's '${type}' rate limit has been exceeded. Try again later.`;
   } else {
     // OpenAI probably overloaded
     errorPayload.proxy_note = `This is likely a temporary error with OpenAI. Try again in a few seconds.`;

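Note: the three handlers above share one pattern — mark the key, re-enqueue the request, then throw `RetryableError` so the response handler returns without finalizing a response. A reduced sketch of that control flow (only `RetryableError` and `reenqueueRequest` appear in the diff; the surrounding wiring here is assumed):

// Sentinel error: thrown after a request has been put back in the queue.
class RetryableError extends Error {}

type ProxiedRequest = { id: string };
const queue: ProxiedRequest[] = [];

// Stand-in for the project's re-enqueue helper.
function reenqueueRequest(req: ProxiedRequest) {
  queue.push(req);
}

// An error handler in the style of the hunks above.
function handleRateLimit(req: ProxiedRequest) {
  reenqueueRequest(req);
  throw new RetryableError("Rate-limited request re-enqueued.");
}

// Top-level response handler: a RetryableError means "do nothing further";
// the queue will re-dispatch the request when a key frees up.
function onUpstreamError(req: ProxiedRequest) {
  try {
    handleRateLimit(req);
  } catch (err) {
    if (err instanceof RetryableError) return;
    throw err; // unrecognized errors still propagate
  }
}
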
@@ -16,7 +16,6 @@
  */
 
 import type { Handler, Request } from "express";
-import { config, DequeueMode } from "../config";
 import { keyPool, SupportedModel } from "../key-management";
 import { logger } from "../logger";
 import { AGNAI_DOT_CHAT_IP } from "./rate-limit";

@@ -27,8 +26,6 @@ export type QueuePartition = "claude" | "turbo" | "gpt-4";
 const queue: Request[] = [];
 const log = logger.child({ module: "request-queue" });
 
-let dequeueMode: DequeueMode = "fair";
-
 /** Maximum number of queue slots for Agnai.chat requests. */
 const AGNAI_CONCURRENCY_LIMIT = 15;
 /** Maximum number of queue slots for individual users. */

@@ -160,18 +157,9 @@ export function dequeue(partition: QueuePartition): Request | undefined {
     return undefined;
   }
 
-  let req: Request;
-
-  if (dequeueMode === "fair") {
-    // Dequeue the request that has been waiting the longest
-    req = modelQueue.reduce((prev, curr) =>
-      prev.startTime < curr.startTime ? prev : curr
-    );
-  } else {
-    // Dequeue a random request
-    const index = Math.floor(Math.random() * modelQueue.length);
-    req = modelQueue[index];
-  }
+  const req = modelQueue.reduce((prev, curr) =>
+    prev.startTime < curr.startTime ? prev : curr
+  );
   queue.splice(queue.indexOf(req), 1);
 
   if (req.onAborted) {

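Note: with `random` mode gone, dequeueing is always oldest-first — the `reduce` is a linear scan for the smallest `startTime`. A self-contained sketch of the same logic (types simplified from the project's Express `Request` augmentation):

type QueuedRequest = { startTime: number };
const queue: QueuedRequest[] = [];

// Fair dequeue: pick the request that has waited longest, then remove it.
function dequeueOldest(): QueuedRequest | undefined {
  if (queue.length === 0) return undefined;
  const oldest = queue.reduce((prev, curr) =>
    prev.startTime < curr.startTime ? prev : curr
  );
  queue.splice(queue.indexOf(oldest), 1);
  return oldest;
}

The scan is O(n) per dequeue, which is fine at the queue sizes a single proxy instance sees; a priority queue would only matter at much larger scale.
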
@@ -293,10 +281,6 @@ export function getQueueLength(partition: QueuePartition | "all" = "all") {
 
 export function createQueueMiddleware(proxyMiddleware: Handler): Handler {
   return (req, res, next) => {
-    if (config.queueMode === "none") {
-      return proxyMiddleware(req, res, next);
-    }
-
     req.proceed = () => {
       proxyMiddleware(req, res, next);
     };

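Note: after this change the middleware always intercepts — rather than invoking the proxy immediately, it stores the call as `req.proceed` for the queue to trigger later. A reduced sketch of that wiring (the `Request` augmentation and enqueue call are assumptions; the real version also enforces the per-user and Agnai.chat queue limits):

import type { Handler, Request } from "express";

// The project augments Express's Request with a deferred-dispatch hook;
// declared here so the sketch type-checks standalone.
declare global {
  namespace Express {
    interface Request {
      proceed: () => void;
    }
  }
}

const queue: Request[] = [];

function createQueueMiddleware(proxyMiddleware: Handler): Handler {
  return (req, res, next) => {
    // Defer the proxy call; dequeue() invokes req.proceed when a slot opens.
    req.proceed = () => {
      proxyMiddleware(req, res, next);
    };
    queue.push(req);
  };
}
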
@@ -102,10 +102,8 @@ async function start() {
     logQueue.start();
   }
 
-  if (config.queueMode !== "none") {
-    logger.info("Starting request queue...");
-    startRequestQueue();
-  }
+  logger.info("Starting request queue...");
+  startRequestQueue();
 
   app.listen(PORT, async () => {
     logger.info({ port: PORT }, "Now listening for connections.");