adds azure rate limit auto-retry

nai-degen 2023-12-04 01:23:55 -06:00
parent fbdea30264
commit fdd824f0e4
7 changed files with 88 additions and 55 deletions

View File

@ -17,9 +17,14 @@ AZURE_CREDENTIALS=contoso-ml:gpt4-8k:0123456789abcdef0123456789abcdef,northwind-
```
## Model assignment
Note that each Azure deployment is assigned a model when you create it in the Microsoft Cognitive Services portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
Note that each Azure deployment is assigned a model when you create it in the Azure OpenAI Service portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
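For illustration, each entry in `AZURE_CREDENTIALS` appears to follow a resource-name:deployment-id:api-key shape, as in the example above. A minimal sketch of splitting the variable into per-deployment credentials; the field names here are assumptions for illustration, not the proxy's actual types:
```ts
// Sketch only: field names are assumptions based on the AZURE_CREDENTIALS
// example above, not the proxy's real parsing code.
interface AzureCredential {
  resourceName: string; // e.g. "contoso-ml"
  deploymentId: string; // e.g. "gpt4-8k" -- the deployment determines the model
  apiKey: string;
}

function parseAzureCredentials(env: string): AzureCredential[] {
  return env
    .split(",")
    .map((entry) => entry.trim())
    .filter(Boolean)
    .map((entry) => {
      const [resourceName, deploymentId, apiKey] = entry.split(":");
      return { resourceName, deploymentId, apiKey };
    });
}
```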
### Supported model IDs
Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models; for the most part, these work the same with Azure. The one quirk is GPT-3.5 Turbo, whose Azure ID is "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert the standard ID to the correct one.
As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
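As a rough illustration of the ID translation described above (a sketch, not the proxy's actual implementation):
```ts
// Sketch only: maps the standard OpenAI ID to the period-free ID Azure expects.
function toAzureModelId(openaiModelId: string): string {
  return openaiModelId.startsWith("gpt-3.5-turbo")
    ? openaiModelId.replace("gpt-3.5-turbo", "gpt-35-turbo")
    : openaiModelId;
}

// toAzureModelId("gpt-3.5-turbo") === "gpt-35-turbo"
// toAzureModelId("gpt-4") === "gpt-4"
```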
## On content filtering
Be aware that all Azure OpenAI Service deployments have content filtering enabled at a Medium level by default. Prompts or responses that are deemed inappropriate will be rejected by the API. This is a feature of the Azure OpenAI Service, not the proxy.
You can disable this in the deployment's settings within Azure, but you will first need to request an exemption from Microsoft for your organization. See [this page](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/content-filters) for more information.

View File

@ -0,0 +1,44 @@
const axios = require("axios");

// Number of simultaneous requests to fire at the proxy.
const concurrentRequests = 5;

const headers = {
  Authorization: "Bearer test",
  "Content-Type": "application/json",
};

// Minimal chat completion payload; max_tokens is kept at 1 so each request is
// cheap and the test exercises rate limiting rather than generation.
const payload = {
  model: "gpt-4",
  max_tokens: 1,
  stream: false,
  messages: [{ role: "user", content: "Hi" }],
};

const makeRequest = async (i) => {
  try {
    const response = await axios.post(
      "http://localhost:7860/proxy/azure/openai/v1/chat/completions",
      payload,
      { headers }
    );
    console.log(
      `Req ${i} finished with status code ${response.status} and response:`,
      response.data
    );
  } catch (error) {
    console.error(`Error in req ${i}:`, error.message);
  }
};

// Fire all requests at once and wait for them to settle.
const executeRequestsConcurrently = () => {
  const promises = [];
  for (let i = 1; i <= concurrentRequests; i++) {
    console.log(`Starting request ${i}`);
    promises.push(makeRequest(i));
  }
  Promise.all(promises).then(() => {
    console.log("All requests finished");
  });
};

executeRequestsConcurrently();
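With auto-retry in place, any of these requests that hit Azure's rate limit should be re-enqueued by the proxy (see `handleAzureRateLimitError` below) rather than surfacing a 429, so all five requests are expected to eventually finish with a 200 response, assuming at least one Azure GPT-4 credential is configured.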

View File

@ -1,40 +0,0 @@
$NumThreads = 10
$runspacePool = [runspacefactory]::CreateRunspacePool(1, $NumThreads)
$runspacePool.Open()
$runspaces = @()

$headers = @{
    "Authorization" = "Bearer test"
    "anthropic-version" = "2023-01-01"
    "Content-Type" = "application/json"
}

$payload = @{
    model = "claude-v2"
    max_tokens_to_sample = 40
    temperature = 0
    stream = $true
    prompt = "Test prompt, please reply with lorem ipsum`n`n:Assistant:"
} | ConvertTo-Json

for ($i = 1; $i -le $NumThreads; $i++) {
    Write-Host "Starting thread $i"
    $runspace = [powershell]::Create()
    $runspace.AddScript({
        param($i, $headers, $payload)
        $response = Invoke-WebRequest -Uri "http://localhost:7860/proxy/aws/claude/v1/complete" -Method Post -Headers $headers -Body $payload
        Write-Host "Response from server: $($response.StatusCode)"
    }).AddArgument($i).AddArgument($headers).AddArgument($payload)
    $runspace.RunspacePool = $runspacePool
    $runspaces += [PSCustomObject]@{ Pipe = $runspace; Status = $runspace.BeginInvoke() }
}

$runspaces | ForEach-Object {
    $_.Pipe.EndInvoke($_.Status)
    $_.Pipe.Dispose()
}

$runspacePool.Close()
$runspacePool.Dispose()

View File

@ -343,8 +343,10 @@ const handleUpstreamErrors: ProxyResHandlerWithBody = async (
case "aws":
handleAwsRateLimitError(req, errorPayload);
break;
case "google-palm":
case "azure":
handleAzureRateLimitError(req, errorPayload);
break;
case "google-palm":
errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
break;
default:
@ -507,6 +509,22 @@ function handleOpenAIRateLimitError(
  return errorPayload;
}

function handleAzureRateLimitError(
  req: Request,
  errorPayload: ProxiedErrorPayload
) {
  const code = errorPayload.error?.code;
  switch (code) {
    case "429":
      keyPool.markRateLimited(req.key!);
      reenqueueRequest(req);
      throw new RetryableError("Rate-limited request re-enqueued.");
    default:
      errorPayload.proxy_note = `Unrecognized rate limit error from Azure (${code}). Please report this.`;
      break;
  }
}

const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
  if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
    const model = req.body.model;

View File

@ -15,6 +15,8 @@ import crypto from "crypto";
import type { Handler, Request } from "express";
import { keyPool } from "../shared/key-management";
import {
  getAwsBedrockModelFamily,
  getAzureOpenAIModelFamily,
  getClaudeModelFamily,
  getGooglePalmModelFamily,
  getOpenAIModelFamily,
@ -136,11 +138,10 @@ function getPartitionForRequest(req: Request): ModelFamily {
  // they should be treated as separate queues.
  const model = req.body.model ?? "gpt-3.5-turbo";
  // Weird special case for AWS because they serve multiple models from
  // Weird special case for AWS/Azure because they serve multiple models from
  // different vendors, even if currently only one is supported.
  if (req.service === "aws") {
    return "aws-claude";
  }
  if (req.service === "aws") return getAwsBedrockModelFamily(model);
  if (req.service === "azure") return getAzureOpenAIModelFamily(model);

  switch (req.outboundApi) {
    case "anthropic":
@ -221,7 +222,11 @@ function processQueue() {
  reqs.filter(Boolean).forEach((req) => {
    if (req?.proceed) {
      req.log.info({ retries: req.retryCount }, `Dequeuing request.`);
      const modelFamily = getPartitionForRequest(req!);
      req.log.info({
        retries: req.retryCount,
        partition: modelFamily,
      }, `Dequeuing request.`);
      req.proceed();
    }
  });
@ -415,6 +420,7 @@ function initStreaming(req: Request) {
    // Some clients have a broken SSE parser that doesn't handle comments
    // correctly. These clients can pass ?badSseParser=true to
    // disable comments in the SSE stream.
    res.write(getHeartbeatPayload());
    return;
  }

View File

@ -157,8 +157,11 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
  // TODO: all of this shit is duplicate code
  public getLockoutPeriod() {
    const activeKeys = this.keys.filter((k) => !k.isDisabled);
  public getLockoutPeriod(family: AzureOpenAIModelFamily) {
    const activeKeys = this.keys.filter(
      (key) => !key.isDisabled && key.modelFamilies.includes(family)
    );
    // Don't lock out if there are no keys available or the queue will stall.
    // Just let it through so the add-key middleware can throw an error.
    if (activeKeys.length === 0) return 0;

View File

@ -276,12 +276,9 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
      (key) => !key.isDisabled && key.modelFamilies.includes(family)
    );
    if (activeKeys.length === 0) {
      // If there are no active keys for this model we can't fulfill requests.
      // We'll return 0 to let the request through and return an error,
      // otherwise the request will be stuck in the queue forever.
      return 0;
    }
    // Don't lock out if there are no keys available or the queue will stall.
    // Just let it through so the add-key middleware can throw an error.
    if (activeKeys.length === 0) return 0;

    // A key is rate-limited if its `rateLimitedAt` plus the greater of its
    // `rateLimitRequestsReset` and `rateLimitTokensReset` is after the