oai-reverse-proxy/src/shared/key-management/openai/provider.ts

/* Manages OpenAI API keys. Tracks usage, disables expired keys, and provides
   round-robin access to keys. Keys are stored in the OPENAI_KEY environment
   variable as a comma-separated list of keys. */
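
// A minimal sketch of the expected configuration (the key values below are
// hypothetical placeholders, not real keys):
//
//   OPENAI_KEY=sk-aaaa1111,sk-bbbb2222
//
// The constructor below splits this string on commas, trims whitespace, and
// de-duplicates entries before creating one OpenAIKey per value.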
import crypto from "crypto";
import http from "http";
import { Key, KeyProvider, Model } from "../index";
import { config } from "../../../config";
import { logger } from "../../../logger";
import { OpenAIKeyChecker } from "./checker";
import { getOpenAIModelFamily, OpenAIModelFamily } from "../../models";

export type OpenAIModel =
  | "gpt-3.5-turbo"
  | "gpt-3.5-turbo-instruct"
  | "gpt-4"
  | "gpt-4-32k"
  | "text-embedding-ada-002";

export const OPENAI_SUPPORTED_MODELS: readonly OpenAIModel[] = [
  "gpt-3.5-turbo",
  "gpt-3.5-turbo-instruct",
  "gpt-4",
] as const;

// Flattening model families instead of using a nested object for easier
// cloning.
type OpenAIKeyUsage = {
  [K in OpenAIModelFamily as `${K}Tokens`]: number;
};
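
// Assuming OpenAIModelFamily is "turbo" | "gpt4" | "gpt4-32k" (the families
// referenced by the key initialization and selection logic below),
// OpenAIKeyUsage expands to roughly:
//
//   { turboTokens: number; gpt4Tokens: number; "gpt4-32kTokens": number }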

export interface OpenAIKey extends Key, OpenAIKeyUsage {
  readonly service: "openai";
  modelFamilies: OpenAIModelFamily[];
  /**
   * Some keys are assigned to multiple organizations, each with their own
   * quota limits. We clone the key for each organization and track
   * usage/disabled status separately.
   */
  organizationId?: string;
  /** Set when key check returns a 401. */
  isRevoked: boolean;
  /** Set when key check returns a non-transient 429. */
  isOverQuota: boolean;
  /** The time at which this key was last rate limited. */
  rateLimitedAt: number;
  /**
   * Last known X-RateLimit-Requests-Reset header from OpenAI, converted to a
   * number of milliseconds. The raw header is a duration string (e.g.
   * "21.0032s" or "21ms") denoting the time until the limit resets.
   * Specifically, it seems to indicate the time until the key's quota will be
   * fully restored; the key may be usable before this time as the limit is a
   * rolling window.
   *
   * Requests which return a 429 do not count against the quota.
   *
   * Requests which fail for other reasons (e.g. 401) count against the quota.
   */
  rateLimitRequestsReset: number;
  /**
   * Last known X-RateLimit-Tokens-Reset header from OpenAI, converted to a
   * number of milliseconds. Appears to follow the same format as
   * `rateLimitRequestsReset`.
   *
   * Requests which fail do not count against the quota, as they do not
   * consume tokens.
   */
  rateLimitTokensReset: number;
}

export type OpenAIKeyUpdate = Omit<
  Partial<OpenAIKey>,
  "key" | "hash" | "promptCount"
>;

/**
 * Upon assigning a key, we will wait this many milliseconds before allowing it
 * to be used again. This is to prevent the queue from flooding a key with too
 * many requests while we wait to learn whether previous ones succeeded.
 */
const KEY_REUSE_DELAY = 1000;

export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
  readonly service = "openai" as const;

  private keys: OpenAIKey[] = [];
  private checker?: OpenAIKeyChecker;
  private log = logger.child({
    module: "key-provider",
    service: this.service,
  });

  constructor() {
    const keyString = config.openaiKey?.trim();
    if (!keyString) {
      this.log.warn("OPENAI_KEY is not set. OpenAI API will not be available.");
      return;
    }
    let bareKeys: string[];
    bareKeys = keyString.split(",").map((k) => k.trim());
    bareKeys = [...new Set(bareKeys)];
    for (const k of bareKeys) {
      const newKey: OpenAIKey = {
        key: k,
        service: "openai" as const,
        modelFamilies: ["turbo" as const, "gpt4" as const],
        isTrial: false,
        isDisabled: false,
        isRevoked: false,
        isOverQuota: false,
        lastUsed: 0,
        lastChecked: 0,
        promptCount: 0,
        hash: `oai-${crypto
          .createHash("sha256")
          .update(k)
          .digest("hex")
          .slice(0, 8)}`,
        rateLimitedAt: 0,
        rateLimitRequestsReset: 0,
        rateLimitTokensReset: 0,
        turboTokens: 0,
        gpt4Tokens: 0,
        "gpt4-32kTokens": 0,
      };
      this.keys.push(newKey);
    }
    this.log.info({ keyCount: this.keys.length }, "Loaded OpenAI keys.");
  }
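
  // As an illustration (hypothetical key material, not a real key): a value of
  // "sk-example" is stored with a hash of the form `oai-xxxxxxxx`, where
  // xxxxxxxx is the first 8 hex characters of sha256("sk-example"). Methods on
  // this provider (update, disable, clone, markRateLimited) look keys up by
  // that hash rather than by the secret key itself.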

  public init() {
    if (config.checkKeys) {
      const cloneFn = this.clone.bind(this);
      const updateFn = this.update.bind(this);
      this.checker = new OpenAIKeyChecker(this.keys, cloneFn, updateFn);
      this.checker.start();
    }
  }

  /**
   * Returns a list of all keys, with the key field removed.
   * Don't mutate returned keys; use a KeyPool method instead.
   **/
  public list() {
    return this.keys.map((key) => {
      return Object.freeze({
        ...key,
        key: undefined,
      });
    });
  }

  public get(model: Model) {
    const neededFamily = getOpenAIModelFamily(model);
    const excludeTrials = model === "text-embedding-ada-002";

    const availableKeys = this.keys.filter(
      // Allow keys which
      (key) =>
        !key.isDisabled && // are not disabled
        key.modelFamilies.includes(neededFamily) && // have access to the model
        (!excludeTrials || !key.isTrial) // and are not trials (if applicable)
    );

    if (availableKeys.length === 0) {
      throw new Error(`No active keys available for ${neededFamily} models.`);
    }

    if (!config.allowedModelFamilies.includes(neededFamily)) {
      throw new Error(
        `Proxy operator has disabled access to ${neededFamily} models.`
      );
    }

    // Select a key, from highest priority to lowest priority:
    // 1. Keys which are not rate limited
    //    a. We ignore rate limits from more than 30 seconds ago
    //    b. If all keys were rate limited within that window, select the
    //       least recently rate limited key
    // 2. Keys which are trials
    // 3. Keys which do *not* have access to GPT-4-32k
    // 4. Keys which have not been used in the longest time
    const now = Date.now();
    const rateLimitThreshold = 30 * 1000;

    const keysByPriority = availableKeys.sort((a, b) => {
      // TODO: this isn't quite right; keys are briefly artificially
      // rate-limited when they are selected, so this will deprioritize keys
      // that may not actually be limited, simply because they were used
      // recently. This should be adjusted to use a new `rateLimitedUntil`
      // field instead of `rateLimitedAt`.
      const aRateLimited = now - a.rateLimitedAt < rateLimitThreshold;
      const bRateLimited = now - b.rateLimitedAt < rateLimitThreshold;

      if (aRateLimited && !bRateLimited) return 1;
      if (!aRateLimited && bRateLimited) return -1;
      if (aRateLimited && bRateLimited) {
        return a.rateLimitedAt - b.rateLimitedAt;
      }
      // Neither key is rate limited, continue

      if (a.isTrial && !b.isTrial) return -1;
      if (!a.isTrial && b.isTrial) return 1;
      // Neither or both keys are trials, continue

      const aHas32k = a.modelFamilies.includes("gpt4-32k");
      const bHas32k = b.modelFamilies.includes("gpt4-32k");
      if (aHas32k && !bHas32k) return 1;
      if (!aHas32k && bHas32k) return -1;
      // Neither or both keys have 32k, continue

      return a.lastUsed - b.lastUsed;
    });

    // logger.debug(
    //   {
    //     byPriority: keysByPriority.map((k) => ({
    //       hash: k.hash,
    //       isRateLimited: now - k.rateLimitedAt < rateLimitThreshold,
    //       modelFamilies: k.modelFamilies,
    //     })),
    //   },
    //   "Keys sorted by priority"
    // );

    const selectedKey = keysByPriority[0];
    selectedKey.lastUsed = now;

    // When a key is selected, we rate-limit it for a brief period of time to
    // prevent the queue processor from immediately flooding it with requests
    // while the initial request is still being processed (which is when we
    // will get new rate limit headers).
    // Instead, we will let a request through every second until the key
    // becomes fully saturated and locked out again.
    selectedKey.rateLimitedAt = now;
    selectedKey.rateLimitRequestsReset = KEY_REUSE_DELAY;
    return { ...selectedKey };
  }
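
  // Illustrative usage sketch (not runtime code in this module; assumes
  // OPENAI_KEY was set when the provider was constructed):
  //
  //   const provider = new OpenAIKeyProvider();
  //   provider.init();
  //   const key = provider.get("gpt-4"); // throws if no active gpt4 keys
  //
  // Note that get() returns a shallow copy of the pooled key; callers pass
  // key.hash back into update()/markRateLimited()/incrementUsage() to affect
  // the pooled original.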

  /** Called by the key checker to update key information. */
  public update(keyHash: string, update: OpenAIKeyUpdate) {
    const keyFromPool = this.keys.find((k) => k.hash === keyHash)!;
    Object.assign(keyFromPool, { lastChecked: Date.now(), ...update });
    // this.writeKeyStatus();
  }

  /** Called by the key checker to create clones of keys for the given orgs. */
  public clone(keyHash: string, newOrgIds: string[]) {
    const keyFromPool = this.keys.find((k) => k.hash === keyHash)!;
    const clones = newOrgIds.map((orgId) => {
      const clone: OpenAIKey = {
        ...keyFromPool,
        organizationId: orgId,
        isDisabled: false,
        hash: `oai-${crypto
          .createHash("sha256")
          .update(keyFromPool.key + orgId)
          .digest("hex")
          .slice(0, 8)}`,
        lastChecked: 0, // Force re-check in case the org has different models
      };
      this.log.info(
        { cloneHash: clone.hash, parentHash: keyFromPool.hash, orgId },
        "Cloned organization key"
      );
      return clone;
    });
    this.keys.push(...clones);
  }

  /** Disables a key, or does nothing if the key isn't in this pool. */
  public disable(key: Key) {
    const keyFromPool = this.keys.find((k) => k.hash === key.hash);
    if (!keyFromPool || keyFromPool.isDisabled) return;
    this.update(key.hash, { isDisabled: true });
    this.log.warn({ key: key.hash }, "Key disabled");
  }

  public available() {
    return this.keys.filter((k) => !k.isDisabled).length;
  }

  /**
   * Given a model, returns the period until a key will be available to
   * service the request, or returns 0 if a key is ready immediately.
   */
  public getLockoutPeriod(model: Model = "gpt-4"): number {
    const neededFamily = getOpenAIModelFamily(model);
    const activeKeys = this.keys.filter(
      (key) => !key.isDisabled && key.modelFamilies.includes(neededFamily)
    );

    if (activeKeys.length === 0) {
      // If there are no active keys for this model we can't fulfill requests.
      // We'll return 0 to let the request through and return an error,
      // otherwise the request will be stuck in the queue forever.
      return 0;
    }

    // A key is rate-limited if its `rateLimitedAt` plus the greater of its
    // `rateLimitRequestsReset` and `rateLimitTokensReset` is after the
    // current time.
    // If there are any keys that are not rate-limited, we can fulfill
    // requests.
    const now = Date.now();
    const rateLimitedKeys = activeKeys.filter((key) => {
      const resetTime = Math.max(
        key.rateLimitRequestsReset,
        key.rateLimitTokensReset
      );
      return now < key.rateLimitedAt + resetTime;
    }).length;
    const anyNotRateLimited = rateLimitedKeys < activeKeys.length;
    if (anyNotRateLimited) {
      return 0;
    }

    // If all keys are rate-limited, return the time until the first key is
    // ready.
    return Math.min(
      ...activeKeys.map((key) => {
        const resetTime = Math.max(
          key.rateLimitRequestsReset,
          key.rateLimitTokensReset
        );
        return key.rateLimitedAt + resetTime - now;
      })
    );
  }
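
  // A worked example with hypothetical numbers: if every active gpt4 key has
  // rateLimitedAt = now - 4000, rateLimitRequestsReset = 1000, and
  // rateLimitTokensReset = 6000, then each key's reset window is
  // max(1000, 6000) = 6000 ms, every key is still rate-limited, and
  // getLockoutPeriod("gpt-4") returns (now - 4000) + 6000 - now = 2000 ms.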

  public markRateLimited(keyHash: string) {
    this.log.debug({ key: keyHash }, "Key rate limited");
    const key = this.keys.find((k) => k.hash === keyHash)!;
    key.rateLimitedAt = Date.now();
  }

  public incrementUsage(keyHash: string, model: string, tokens: number) {
    const key = this.keys.find((k) => k.hash === keyHash);
    if (!key) return;
    key.promptCount++;
    key[`${getOpenAIModelFamily(model)}Tokens`] += tokens;
  }

  public updateRateLimits(keyHash: string, headers: http.IncomingHttpHeaders) {
    const key = this.keys.find((k) => k.hash === keyHash)!;
    const requestsReset = headers["x-ratelimit-reset-requests"];
    const tokensReset = headers["x-ratelimit-reset-tokens"];

    // Sometimes OpenAI only sends one of the two rate limit headers, it's
    // unclear why.

    if (requestsReset && typeof requestsReset === "string") {
      this.log.debug(
        { key: key.hash, requestsReset },
        `Updating rate limit requests reset time`
      );
      key.rateLimitRequestsReset = getResetDurationMillis(requestsReset);
    }

    if (tokensReset && typeof tokensReset === "string") {
      this.log.debug(
        { key: key.hash, tokensReset },
        `Updating rate limit tokens reset time`
      );
      key.rateLimitTokensReset = getResetDurationMillis(tokensReset);
    }

    if (!requestsReset && !tokensReset) {
      this.log.warn(
        { key: key.hash },
        `No rate limit headers in OpenAI response; skipping update`
      );
      return;
    }
  }
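
  // For example (hypothetical header values): a response carrying
  //   x-ratelimit-reset-requests: "1.2s"
  //   x-ratelimit-reset-tokens: "90ms"
  // would set rateLimitRequestsReset to 1200 and rateLimitTokensReset to 90
  // via getResetDurationMillis below.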

  public recheck() {
    this.keys.forEach((key) => {
      this.update(key.hash, {
        isRevoked: false,
        isOverQuota: false,
        isDisabled: false,
        lastChecked: 0,
      });
    });
    this.checker?.scheduleNextCheck();
  }

  /** Writes key status to disk. */
  // public writeKeyStatus() {
  //   const keys = this.keys.map((key) => ({
  //     key: key.key,
  //     isGpt4: key.isGpt4,
  //     usage: key.usage,
  //     hardLimit: key.hardLimit,
  //     isDisabled: key.isDisabled,
  //   }));
  //   fs.writeFileSync(
  //     path.join(__dirname, "..", "keys.json"),
  //     JSON.stringify(keys, null, 2)
  //   );
  // }
}

/**
 * Converts reset string ("21.0032s" or "21ms") to a number of milliseconds.
 * Result is clamped to 10s even though the API returns up to 60s, because the
 * API returns the time until the entire quota is reset, even if a key may be
 * able to fulfill requests before then due to partial resets.
 **/
function getResetDurationMillis(resetDuration?: string): number {
  const match = resetDuration?.match(/(\d+(\.\d+)?)(s|ms)/);
  if (match) {
    const [, time, , unit] = match;
    const value = parseFloat(time);
    const result = unit === "s" ? value * 1000 : value;
    return Math.min(result, 10000);
  }
  return 0;
}
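
// Hypothetical sanity checks for the conversion above:
//   getResetDurationMillis("21.0032s") === 10000  (21003.2 clamped to 10s)
//   getResetDurationMillis("21ms") === 21
//   getResetDurationMillis(undefined) === 0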