Implement priority queue by tokens (khanon/oai-reverse-proxy!79)

honeytree 2024-09-09 16:48:46 +00:00 committed by khanon
parent ac1897fd17
commit bd87ca60f7
4 changed files with 70 additions and 1 deletion

@@ -176,3 +176,10 @@ GCP_CREDENTIALS=project-id:client-email:region:private-key
# With prompt logging, the Google Sheets credentials.
# GOOGLE_SHEETS_SPREADSHEET_ID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# GOOGLE_SHEETS_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Prioritize requests in the queue according to their
# token count, placing larger requests further back.
#
# A value of 1.0 delays a request by one second per 1k
# tokens, 2.0 by two seconds per 1k tokens, and so on.
# TOKENS_PUNISHMENT_FACTOR=0.0
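
In effect, the factor converts a request's token count into milliseconds of artificial queue age. A minimal sketch of the resulting scheduling key (effectiveStartTime is a hypothetical helper for illustration; the actual comparator lives in the queue change further down):

// Effective arrival time: real arrival plus factor milliseconds per token.
// With a factor of 2.0, a request totaling 1,500 prompt-plus-output tokens is
// queued as if it had arrived 2.0 * 1500 = 3,000 ms later than it really did.
function effectiveStartTime(startTimeMs: number, tokens: number, factor: number): number {
  return startTimeMs + factor * tokens;
}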

scripts/test-queue.js (new file, 53 lines)
@@ -0,0 +1,53 @@
const axios = require("axios");

// Returns a random integer in [1, max].
function randomInteger(max) {
  return Math.floor(Math.random() * max + 1);
}

async function testQueue() {
  // Fire ten concurrent requests with randomized token budgets and randomized
  // X-Forwarded-For addresses, so each request appears to come from a
  // different client.
  const requests = Array(10).fill(undefined).map(async function () {
    const maxTokens = randomInteger(2000);
    const headers = {
      "Authorization": "Bearer test",
      "Content-Type": "application/json",
      "X-Forwarded-For": `${randomInteger(255)}.${randomInteger(255)}.${randomInteger(255)}.${randomInteger(255)}`,
    };
    const payload = {
      model: "gpt-4o-mini-2024-07-18",
      max_tokens: 20 + maxTokens,
      stream: false,
      messages: [{ role: "user", content: "You are being benchmarked regarding your reliability at outputting exact, machine-comprehensible data. Output the sentence \"The quick brown fox jumps over the lazy dog.\" Do not precede it with quotemarks or any form of preamble, and do not output anything after the sentence." }],
      temperature: 0,
    };
    try {
      const response = await axios.post(
        "http://localhost:7860/proxy/openai/v1/chat/completions",
        payload,
        { headers }
      );
      if (response.status !== 200) {
        console.error(`Request ${maxTokens} finished with status code ${response.status} and response`, response.data);
        return;
      }
      const content = response.data.choices[0].message.content;
      console.log(
        `Request ${maxTokens} `,
        content === "The quick brown fox jumps over the lazy dog." ? "OK" : `mangled: ${content}`
      );
    } catch (error) {
      const msg = error.response;
      console.error(`Error in req ${maxTokens}:`, error.message, msg || "");
    }
  });
  await Promise.all(requests);
  console.log("All requests finished");
}

testQueue();
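
To try it: with the proxy running locally (the script assumes it listens on port 7860 and accepts the hard-coded "Bearer test" credential), run node scripts/test-queue.js. With a positive TOKENS_PUNISHMENT_FACTOR, requests that drew larger max_tokens values should tend to be dequeued, and therefore finish, later than smaller ones enqueued at the same time.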

@@ -378,6 +378,13 @@ type Config = {
   * Takes precedence over the adminWhitelist.
   */
  ipBlacklist: string[];
  /**
   * If set, pushes requests further back in the queue in proportion to their
   * token cost: each request is delayed by factor * tokens milliseconds
   * (more intuitively, factor seconds per thousand tokens).
   * Accepts floats.
   */
  tokensPunishmentFactor: number;
};
// To change configs, create a file called .env in the root directory.
@@ -483,6 +490,7 @@ export const config: Config = {
    getEnvWithDefault("ADMIN_WHITELIST", "0.0.0.0/0,::/0")
  ),
  ipBlacklist: parseCsv(getEnvWithDefault("IP_BLACKLIST", "")),
  tokensPunishmentFactor: getEnvWithDefault("TOKENS_PUNISHMENT_FACTOR", 0.0),
} as const;
function generateSigningKey() {
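
Note that the default of 0.0 makes the punishment term vanish, so the dequeue comparator below reduces to plain startTime ordering and deployments that never set TOKENS_PUNISHMENT_FACTOR keep strict first-come-first-served behavior.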

@@ -13,6 +13,7 @@
import crypto from "crypto";
import { Handler, Request } from "express";
import { config } from "../config";
import { BadRequestError, TooManyRequestsError } from "../shared/errors";
import { keyPool } from "../shared/key-management";
import {
@@ -140,7 +141,7 @@ export function dequeue(partition: ModelFamily): Request | undefined {
  }
  const req = modelQueue.reduce((prev, curr) =>
    prev.startTime + config.tokensPunishmentFactor * ((prev.promptTokens ?? 0) + (prev.outputTokens ?? 0)) <
    curr.startTime + config.tokensPunishmentFactor * ((curr.promptTokens ?? 0) + (curr.outputTokens ?? 0))
      ? prev
      : curr
  );
  queue.splice(queue.indexOf(req), 1);
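
To make the new comparator concrete, here is a small standalone trace of the effective-arrival-time math (a sketch with made-up numbers; QueuedRequest is a pared-down stand-in for the proxy's Request objects, not part of this commit):

// Pared-down stand-in for the queued Request shape used above (illustrative).
type QueuedRequest = { startTime: number; promptTokens?: number; outputTokens?: number };

const tokensPunishmentFactor = 1.0; // as if TOKENS_PUNISHMENT_FACTOR=1.0

// Effective priority: arrival time plus factor milliseconds per token.
const priority = (r: QueuedRequest) =>
  r.startTime + tokensPunishmentFactor * ((r.promptTokens ?? 0) + (r.outputTokens ?? 0));

// A arrived first but is large; B arrived 1,500 ms later but is small.
const a: QueuedRequest = { startTime: 0, promptTokens: 3000, outputTokens: 500 };
const b: QueuedRequest = { startTime: 1500, promptTokens: 200, outputTokens: 100 };

console.log(priority(a)); // 3500
console.log(priority(b)); // 1800 -> B wins the reduce and is dequeued first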