improves keychecker stability with rate-limited trial keys

nai-degen 2023-08-30 08:33:00 -05:00
parent 4d781e1720
commit e462ad585e
2 changed files with 60 additions and 31 deletions


@@ -139,19 +139,15 @@ export class OpenAIKeyChecker {
         const updates = {
           modelFamilies: provisionedModels,
-          softLimit: 0,
-          hardLimit: 0,
-          systemHardLimit: 0,
+          isTrial: livenessTest.rateLimit <= 250,
         };
         this.updateKey(key.hash, updates);
       } else {
-        // Provisioned models don't change, so we don't need to check them again
+        // No updates needed as models and trial status generally don't change.
         const [_livenessTest] = await Promise.all([this.testLiveness(key)]);
-        const updates = { softLimit: 0, hardLimit: 0, systemHardLimit: 0 };
-        this.updateKey(key.hash, updates);
+        this.updateKey(key.hash, {});
       }
       this.log.info(
-        { key: key.hash, models: key.modelFamilies },
+        { key: key.hash, models: key.modelFamilies, trial: key.isTrial },
        "Key check complete."
       );
     } catch (error) {
@@ -164,6 +160,10 @@ export class OpenAIKeyChecker {
     // Only enqueue the next check if this wasn't a startup check, since those
     // are batched together elsewhere.
     if (!isInitialCheck) {
-      this.scheduleNextCheck();
+      this.log.info(
+        { key: key.hash },
+        "Recurring keychecks are disabled, no-op."
+      );
+      // this.scheduleNextCheck();
     }
   }
@@ -192,7 +192,6 @@ export class OpenAIKeyChecker {
     // update its `lastChecked` timestamp because we need to let the liveness
     // check run before we can consider the key checked.
     // Need to use `find` here because keys are cloned from the pool.
     const keyFromPool = this.keys.find((k) => k.hash === key.hash)!;
     this.updateKey(key.hash, {
       modelFamilies: families,
@@ -259,21 +258,32 @@ export class OpenAIKeyChecker {
         });
         break;
       case "requests":
-        // Trial keys have extremely low requests-per-minute limits and we
-        // can often hit them just while checking the key, so we need to
-        // retry the check later to know if the key has quota remaining.
-        this.log.warn(
-          { key: key.hash, error: data },
-          "Key is currently rate limited, so its liveness cannot be checked. Retrying in fifteen seconds."
-        );
-        // To trigger a shorter than usual delay before the next check, we
-        // will set its `lastChecked` to (NOW - (KEY_CHECK_PERIOD - 15s)).
-        // This will cause the usual key check scheduling logic to schedule
-        // the next check in 15 seconds. This also prevents the key from
-        // holding up startup checks for other keys.
-        const fifteenSeconds = 15 * 1000;
-        const next = Date.now() - (KEY_CHECK_PERIOD - fifteenSeconds);
-        this.updateKey(key.hash, { lastChecked: next });
+        // If we hit the text completion rate limit on a trial key, it is
+        // likely being used by many proxies. We will disable the key since
+        // it's just going to be constantly rate limited.
+        const isTrial =
+          Number(error.response.headers["x-ratelimit-limit-requests"]) <= 250;
+        if (isTrial) {
+          this.log.warn(
+            { key: key.hash, error: data },
+            "Trial key is rate limited on text completion endpoint. This indicates the key is being used by several proxies at once and is not likely to be usable. Disabling key."
+          );
+          this.updateKey(key.hash, {
+            isTrial,
+            isDisabled: true,
+            isOverQuota: true,
+            modelFamilies: ["turbo"],
+            lastChecked: Date.now(),
+          });
+        } else {
+          this.log.warn(
+            { key: key.hash, error: data },
+            "Non-trial key is rate limited on text completion endpoint. This is unusual and may indicate a bug. Assuming key is operational."
+          );
+          this.updateKey(key.hash, { lastChecked: Date.now() });
+        }
         break;
       case "tokens":
         // Hitting a token rate limit, even on a trial key, actually implies
@@ -321,8 +331,26 @@ export class OpenAIKeyChecker {
    * We use the rate limit header to determine whether it's a trial key.
    */
   private async testLiveness(key: OpenAIKey): Promise<{ rateLimit: number }> {
+    // What the hell this is doing:
+    // OpenAI enforces separate rate limits for chat and text completions. Trial
+    // keys have extremely low rate limits of 200 per day per API type. In order
+    // to avoid wasting more valuable chat quota, we send an (invalid) chat
+    // request to Babbage (a text completion model). Even though our request is
+    // to the chat endpoint, we get text rate limit headers back because the
+    // requested model determines the rate limit used, not the endpoint.
+    // Once we have headers, we can determine:
+    // 1. Is the key revoked? (401, OAI doesn't even validate the request)
+    // 2. Is the key out of quota? (400, OAI will still validate the request)
+    // 3. Is the key a trial key? (400, x-ratelimit-limit-requests: 200)
+    // This might still cause issues if too many proxies are running a train on
+    // the same trial key and even the text completion quota is exhausted, but
+    // it should work better than the alternative.
     const payload = {
-      model: "gpt-3.5-turbo",
+      model: "babbage-002",
       max_tokens: -1,
       messages: [{ role: "user", content: "" }],
     };

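The hunk above swaps the liveness probe's model from gpt-3.5-turbo to babbage-002 so trial detection spends text completion quota rather than the more valuable chat quota. As a hedged illustration of the mechanism the comment describes, here is a minimal standalone sketch; probeRateLimit is a hypothetical helper, not code from this commit, and the axios usage is an assumption (though it matches the error.response.headers access pattern elsewhere in the diff):

import axios, { AxiosError } from "axios";

// Hypothetical helper illustrating the probe; not part of the commit.
async function probeRateLimit(apiKey: string): Promise<number> {
  const payload = {
    model: "babbage-002",
    max_tokens: -1, // invalid on purpose: rejected after auth, costs no quota
    messages: [{ role: "user", content: "" }],
  };
  try {
    const res = await axios.post(
      "https://api.openai.com/v1/chat/completions",
      payload,
      { headers: { Authorization: `Bearer ${apiKey}` } }
    );
    // Not expected: the invalid payload should always be rejected.
    return Number(res.headers["x-ratelimit-limit-requests"] ?? 0);
  } catch (e) {
    const response = (e as AxiosError).response;
    if (!response) throw e; // network failure, no verdict on the key
    // A 401 here means the key is revoked; on a 400/429 the request-limit
    // header distinguishes trial keys (<= 250) from paid keys.
    return Number(response.headers["x-ratelimit-limit-requests"] ?? 0);
  }
}

// Usage mirroring the diff's heuristic:
// const isTrial = (await probeRateLimit(key)) <= 250;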

@@ -30,12 +30,6 @@ export interface OpenAIKey extends Key {
   isRevoked: boolean;
   /** Set when key check returns a non-transient 429. */
   isOverQuota: boolean;
-  /** Threshold at which a warning email will be sent by OpenAI. */
-  softLimit: number;
-  /** Threshold at which the key will be disabled because it has reached the user-defined limit. */
-  hardLimit: number;
-  /** The maximum quota allocated to this key by OpenAI. */
-  systemHardLimit: number;
   /** The time at which this key was last rate limited. */
   rateLimitedAt: number;
   /**
@@ -359,7 +353,14 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
   }
   public recheck() {
-    this.keys.forEach((key) => (key.lastChecked = 0));
+    this.keys.forEach((key) => {
+      this.update(key.hash, {
+        isRevoked: false,
+        isOverQuota: false,
+        isDisabled: false,
+        lastChecked: 0,
+      });
+    });
     this.checker?.scheduleNextCheck();
   }
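One note on the recheck() change above: it now clears isRevoked, isOverQuota, and isDisabled in addition to zeroing lastChecked, so previously disabled keys get a fresh verdict instead of staying dead forever. Per the scheduling comments removed from the checker, the next check is derived from lastChecked, so resetting it to 0 makes every key look maximally stale and forces a full sweep. A minimal sketch of that reset, with an illustrative key shape (the real OpenAIKey carries more fields):

// Illustrative shape only; the real OpenAIKey has many more fields.
interface ResettableKey {
  hash: string;
  isRevoked: boolean;
  isOverQuota: boolean;
  isDisabled: boolean;
  lastChecked: number; // 0 = never checked; the scheduler prefers the stalest key
}

// Clearing the disable flags and the staleness marker queues every key,
// including ones previously marked revoked or over quota, for re-validation.
function resetForRecheck(keys: ResettableKey[]): void {
  for (const key of keys) {
    key.isRevoked = false;
    key.isOverQuota = false;
    key.isDisabled = false;
    key.lastChecked = 0;
  }
}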