Show per-model queues and keys on info page (khanon/oai-reverse-proxy!22)

khanon 2023-06-08 18:50:04 +00:00
parent 120040c028
commit 4f2a12ef14
2 changed files with 164 additions and 95 deletions

View File

@@ -4,7 +4,11 @@ import showdown from "showdown";
import { config, listConfig } from "./config";
import { keyPool } from "./key-management";
import { getUniqueIps } from "./proxy/rate-limit";
import { getEstimatedWaitTime, getQueueLength } from "./proxy/queue";
import {
QueuePartition,
getEstimatedWaitTime,
getQueueLength,
} from "./proxy/queue";
const INFO_PAGE_TTL = 5000;
let infoPageHtml: string | undefined;
@@ -16,86 +20,33 @@ export const handleInfoPage = (req: Request, res: Response) => {
return;
}
const baseUrl = process.env.SPACE_ID
? getExternalUrlForHuggingfaceSpaceId(process.env.SPACE_ID)
: req.protocol + "://" + req.get("host");
// Sometimes huggingface doesn't send the host header and makes us guess.
const baseUrl =
process.env.SPACE_ID && !req.get("host")?.includes("hf.space")
? getExternalUrlForHuggingfaceSpaceId(process.env.SPACE_ID)
: req.protocol + "://" + req.get("host");
res.send(cacheInfoPageHtml(baseUrl));
};
function cacheInfoPageHtml(baseUrl: string) {
const keys = keyPool.list();
let keyInfo: Record<string, any> = { all: keys.length };
const openAIKeys = keys.filter((k) => k.service === "openai");
const anthropicKeys = keys.filter((k) => k.service === "anthropic");
let anthropicInfo: Record<string, any> = {
all: anthropicKeys.length,
active: anthropicKeys.filter((k) => !k.isDisabled).length,
};
let openAIInfo: Record<string, any> = {
all: openAIKeys.length,
active: openAIKeys.filter((k) => !k.isDisabled).length,
};
if (keyPool.anyUnchecked()) {
const uncheckedKeys = keys.filter((k) => !k.lastChecked);
openAIInfo = {
...openAIInfo,
active: keys.filter((k) => !k.isDisabled).length,
status: `Still checking ${uncheckedKeys.length} keys...`,
};
} else if (config.checkKeys) {
const trialKeys = openAIKeys.filter((k) => k.isTrial);
const turboKeys = openAIKeys.filter((k) => !k.isGpt4 && !k.isDisabled);
const gpt4Keys = openAIKeys.filter((k) => k.isGpt4 && !k.isDisabled);
const quota: Record<string, string> = { turbo: "", gpt4: "" };
const hasGpt4 = openAIKeys.some((k) => k.isGpt4);
const turboQuota = keyPool.remainingQuota("openai") * 100;
const gpt4Quota = keyPool.remainingQuota("openai", { gpt4: true }) * 100;
if (config.quotaDisplayMode === "full") {
const turboUsage = keyPool.usageInUsd("openai");
const gpt4Usage = keyPool.usageInUsd("openai", { gpt4: true });
quota.turbo = `${turboUsage} (${Math.round(turboQuota)}% remaining)`;
quota.gpt4 = `${gpt4Usage} (${Math.round(gpt4Quota)}% remaining)`;
} else {
quota.turbo = `${Math.round(turboQuota)}%`;
quota.gpt4 = `${Math.round(gpt4Quota)}%`;
}
if (!hasGpt4) {
delete quota.gpt4;
}
openAIInfo = {
...openAIInfo,
trial: trialKeys.length,
active: {
turbo: turboKeys.length,
...(hasGpt4 ? { gpt4: gpt4Keys.length } : {}),
},
...(config.quotaDisplayMode !== "none" ? { quota: quota } : {}),
};
}
keyInfo = {
...(openAIKeys.length ? { openai: openAIInfo } : {}),
...(anthropicKeys.length ? { anthropic: anthropicInfo } : {}),
};
const openaiKeys = keys.filter((k) => k.service === "openai").length;
const anthropicKeys = keys.filter((k) => k.service === "anthropic").length;
const info = {
uptime: process.uptime(),
endpoints: {
openai: baseUrl + "/proxy/openai",
anthropic: baseUrl + "/proxy/anthropic",
...(openaiKeys ? { openai: baseUrl + "/proxy/openai" } : {}),
...(anthropicKeys ? { anthropic: baseUrl + "/proxy/anthropic" } : {}),
},
proompts: keys.reduce((acc, k) => acc + k.promptCount, 0),
...(config.modelRateLimit ? { proomptersNow: getUniqueIps() } : {}),
...getQueueInformation(),
keys: keyInfo,
openaiKeys,
anthropicKeys,
...(openaiKeys ? getOpenAIInfo() : {}),
...(anthropicKeys ? getAnthropicInfo() : {}),
config: listConfig(),
build: process.env.BUILD_INFO || "dev",
};
@@ -124,6 +75,98 @@ function cacheInfoPageHtml(baseUrl: string) {
return pageBody;
}
type ServiceInfo = {
activeKeys: number;
trialKeys?: number;
quota: string;
proomptersInQueue: number;
estimatedQueueTime: string;
};
// this has long since outgrown this awful "dump everything in a <pre> tag" approach
// but I really don't want to spend time on a proper UI for this right now
function getOpenAIInfo() {
const info: { [model: string]: Partial<ServiceInfo> } = {};
const keys = keyPool.list().filter((k) => k.service === "openai");
const hasGpt4 = keys.some((k) => k.isGpt4);
if (keyPool.anyUnchecked()) {
const uncheckedKeys = keys.filter((k) => !k.lastChecked);
info.status = `Still checking ${uncheckedKeys.length} keys...` as any;
} else {
delete info.status;
}
if (config.checkKeys) {
const turboKeys = keys.filter((k) => !k.isGpt4 && !k.isDisabled);
const gpt4Keys = keys.filter((k) => k.isGpt4 && !k.isDisabled);
const quota: Record<string, string> = { turbo: "", gpt4: "" };
const turboQuota = keyPool.remainingQuota("openai") * 100;
const gpt4Quota = keyPool.remainingQuota("openai", { gpt4: true }) * 100;
if (config.quotaDisplayMode === "full") {
const turboUsage = keyPool.usageInUsd("openai");
const gpt4Usage = keyPool.usageInUsd("openai", { gpt4: true });
quota.turbo = `${turboUsage} (${Math.round(turboQuota)}% remaining)`;
quota.gpt4 = `${gpt4Usage} (${Math.round(gpt4Quota)}% remaining)`;
} else {
quota.turbo = `${Math.round(turboQuota)}%`;
quota.gpt4 = `${Math.round(gpt4Quota)}%`;
}
info.turbo = {
activeKeys: turboKeys.filter((k) => !k.isDisabled).length,
trialKeys: turboKeys.filter((k) => k.isTrial).length,
quota: quota.turbo,
};
if (hasGpt4) {
info.gpt4 = {
activeKeys: gpt4Keys.filter((k) => !k.isDisabled).length,
trialKeys: gpt4Keys.filter((k) => k.isTrial).length,
quota: quota.gpt4,
};
}
if (config.quotaDisplayMode === "none") {
delete info.turbo?.quota;
delete info.gpt4?.quota;
}
} else {
info.status = "Key checking is disabled." as any;
info.turbo = { activeKeys: keys.filter((k) => !k.isDisabled).length };
}
if (config.queueMode !== "none") {
const turboQueue = getQueueInformation("turbo");
info.turbo.proomptersInQueue = turboQueue.proomptersInQueue;
info.turbo.estimatedQueueTime = turboQueue.estimatedQueueTime;
if (hasGpt4) {
const gpt4Queue = getQueueInformation("gpt-4");
info.gpt4.proomptersInQueue = gpt4Queue.proomptersInQueue;
info.gpt4.estimatedQueueTime = gpt4Queue.estimatedQueueTime;
}
}
return info;
}
function getAnthropicInfo() {
const claudeInfo: Partial<ServiceInfo> = {};
const keys = keyPool.list().filter((k) => k.service === "anthropic");
claudeInfo.activeKeys = keys.filter((k) => !k.isDisabled).length;
if (config.queueMode !== "none") {
const queue = getQueueInformation("claude");
claudeInfo.proomptersInQueue = queue.proomptersInQueue;
claudeInfo.estimatedQueueTime = queue.estimatedQueueTime;
}
return { claude: claudeInfo };
}
/**
* If the server operator provides a `greeting.md` file, it will be included in
* the rendered info page.
@@ -147,11 +190,23 @@ Logs are anonymous and do not contain IP addresses or timestamps. [You can see t
}
if (config.queueMode !== "none") {
const friendlyWaitTime = getQueueInformation().estimatedQueueTime;
infoBody += `\n### Estimated Wait Time: ${friendlyWaitTime}
Queueing is enabled. If the AI is busy, your prompt will be processed when a slot frees up.
**Enable Streaming in your preferred front-end to prevent timeouts while waiting in the queue.**`;
const waits = [];
infoBody += `\n## Estimated Wait Times\nIf the AI is busy, your prompt will be processed when a slot frees up.`;
if (config.openaiKey) {
const turboWait = getQueueInformation("turbo").estimatedQueueTime;
const gpt4Wait = getQueueInformation("gpt-4").estimatedQueueTime;
waits.push(`**Turbo:** ${turboWait}`);
if (keyPool.list().some((k) => k.isGpt4)) {
waits.push(`**GPT-4:** ${gpt4Wait}`);
}
}
if (config.anthropicKey) {
const claudeWait = getQueueInformation("claude").estimatedQueueTime;
waits.push(`**Claude:** ${claudeWait}`);
}
infoBody += "\n\n" + waits.join(" / ");
}
if (customGreeting) {
@@ -162,11 +217,11 @@ ${customGreeting}`;
}
/** Returns queue time in seconds, or minutes + seconds if over 60 seconds. */
function getQueueInformation() {
function getQueueInformation(partition: QueuePartition) {
if (config.queueMode === "none") {
return {};
}
const waitMs = getEstimatedWaitTime();
const waitMs = getEstimatedWaitTime(partition);
const waitTime =
waitMs < 60000
? `${Math.round(waitMs / 1000)}sec`
@@ -174,7 +229,7 @@ function getQueueInformation() {
(waitMs % 60000) / 1000
)}sec`;
return {
proomptersInQueue: getQueueLength(),
proomptersInQueue: getQueueLength(partition),
estimatedQueueTime: waitMs > 2000 ? waitTime : "no wait",
};
}
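Taken together, the new getOpenAIInfo() and getAnthropicInfo() helpers replace the old flat keyInfo block with per-model sections. As a rough sketch only (not part of the diff; field names follow the ServiceInfo type above, while the exampleInfo name and all values are made up for illustration), the rebuilt info payload might look something like this:

// Illustrative only: approximate shape of the /info response after this change,
// assuming two active OpenAI keys (one GPT-4-capable) and one Anthropic key.
const exampleInfo = {
  uptime: 3600,
  endpoints: {
    openai: "https://example.com/proxy/openai",
    anthropic: "https://example.com/proxy/anthropic",
  },
  proompts: 1234,
  openaiKeys: 2,
  anthropicKeys: 1,
  turbo: {
    activeKeys: 2,
    trialKeys: 0,
    quota: "87%",
    proomptersInQueue: 3,
    estimatedQueueTime: "45sec",
  },
  gpt4: {
    activeKeys: 1,
    trialKeys: 0,
    quota: "42%",
    proomptersInQueue: 1,
    estimatedQueueTime: "1min 10sec",
  },
  claude: {
    activeKeys: 1,
    proomptersInQueue: 0,
    estimatedQueueTime: "no wait",
  },
  config: {}, // listConfig() output elided
  build: "dev",
};

Which sections appear depends on configuration: quota is dropped when quotaDisplayMode is "none", gpt4 only shows up if a GPT-4-capable key exists, and the queue fields are added only when queueMode is enabled.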

View File

@@ -22,6 +22,8 @@ import { logger } from "../logger";
import { AGNAI_DOT_CHAT_IP } from "./rate-limit";
import { buildFakeSseMessage } from "./middleware/common";
export type QueuePartition = "claude" | "turbo" | "gpt-4";
const queue: Request[] = [];
const log = logger.child({ module: "request-queue" });
@@ -89,7 +91,8 @@ export function enqueue(req: Request) {
req.res!.write(": queue heartbeat\n\n");
} else {
req.log.info(`Sending heartbeat to request in queue.`);
const avgWait = Math.round(getEstimatedWaitTime() / 1000);
const partition = getPartitionForRequest(req);
const avgWait = Math.round(getEstimatedWaitTime(partition) / 1000);
const currentDuration = Math.round((Date.now() - req.startTime) / 1000);
const debugMsg = `queue length: ${queue.length}; elapsed time: ${currentDuration}s; avg wait: ${avgWait}s`;
req.res!.write(buildFakeSseMessage("heartbeat", debugMsg, req));
@@ -119,25 +122,29 @@ export function enqueue(req: Request) {
}
}
type QueuePartition = "claude" | "turbo" | "gpt-4";
export function dequeue(partition: QueuePartition): Request | undefined {
const modelQueue = queue.filter((req) => {
const provider = req.outboundApi;
const model = (req.body.model as SupportedModel) ?? "gpt-3.5-turbo";
switch (partition) {
case "claude":
return provider === "anthropic";
case "gpt-4":
return provider === "openai" && model.startsWith("gpt-4");
case "turbo":
return provider === "openai";
}
});
function getPartitionForRequest(req: Request): QueuePartition {
// There is a single request queue, but it is partitioned by model and API
// provider.
// - claude: requests for the Anthropic API, regardless of model
// - gpt-4: requests for the OpenAI API, specifically for GPT-4 models
// - turbo: effectively, all other requests
const provider = req.outboundApi;
const model = (req.body.model as SupportedModel) ?? "gpt-3.5-turbo";
if (provider === "anthropic") {
return "claude";
}
if (provider === "openai" && model.startsWith("gpt-4")) {
return "gpt-4";
}
return "turbo";
}
function getQueueForPartition(partition: QueuePartition): Request[] {
return queue.filter((req) => getPartitionForRequest(req) === partition);
}
export function dequeue(partition: QueuePartition): Request | undefined {
const modelQueue = getQueueForPartition(partition);
if (modelQueue.length === 0) {
return undefined;
@@ -226,7 +233,7 @@ function cleanQueue() {
(waitTime) => now - waitTime.end > 300 * 1000
);
const removed = waitTimes.splice(0, index + 1);
log.debug(
log.trace(
{ stalledRequests: oldRequests.length, prunedWaitTimes: removed.length },
`Cleaning up request queue.`
);
@@ -239,20 +246,23 @@ export function start() {
log.info(`Started request queue.`);
}
let waitTimes: { start: number; end: number }[] = [];
let waitTimes: { partition: QueuePartition; start: number; end: number }[] = [];
/** Adds a successful request to the list of wait times. */
export function trackWaitTime(req: Request) {
waitTimes.push({
partition: getPartitionForRequest(req),
start: req.startTime!,
end: req.queueOutTime ?? Date.now(),
});
}
/** Returns average wait time in milliseconds. */
export function getEstimatedWaitTime() {
export function getEstimatedWaitTime(partition: QueuePartition) {
const now = Date.now();
const recentWaits = waitTimes.filter((wt) => now - wt.end < 300 * 1000);
const recentWaits = waitTimes.filter(
(wt) => wt.partition === partition && now - wt.end < 300 * 1000
);
if (recentWaits.length === 0) {
return 0;
}
@@ -263,8 +273,12 @@
);
}
export function getQueueLength() {
return queue.length;
export function getQueueLength(partition: QueuePartition | "all" = "all") {
if (partition === "all") {
return queue.length;
}
const modelQueue = getQueueForPartition(partition);
return modelQueue.length;
}
export function createQueueMiddleware(proxyMiddleware: Handler): Handler {
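For context, a brief sketch of how the partition-aware exports might be consumed (assumed consumer code, not part of this commit; the import path mirrors the one used by the info page, and describeQueues is a hypothetical helper):

import {
  QueuePartition,
  getEstimatedWaitTime,
  getQueueLength,
} from "./proxy/queue";

// Report queue depth and the rolling average wait (computed from requests that
// completed in the last five minutes) for each partition added by this change.
function describeQueues(): string[] {
  const partitions: QueuePartition[] = ["turbo", "gpt-4", "claude"];
  return partitions.map((partition) => {
    const queued = getQueueLength(partition);
    const waitSec = Math.round(getEstimatedWaitTime(partition) / 1000);
    return `${partition}: ${queued} queued, ~${waitSec}s estimated wait`;
  });
}

// e.g. "turbo: 3 queued, ~45s estimated wait"
describeQueues().forEach((line) => console.log(line));

The change keeps a single physical queue and derives each request's partition on the fly via getPartitionForRequest, so the info page can report Turbo, GPT-4, and Claude waits independently without maintaining three separate queues.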