adds azure rate limit auto-retry
commit fdd824f0e4 (parent fbdea30264)

@@ -17,9 +17,14 @@ AZURE_CREDENTIALS=contoso-ml:gpt4-8k:0123456789abcdef0123456789abcdef,northwind-
 ```
 
 ## Model assignment
 
-Note that each Azure deployment is assigned a model when you create it in the Microsoft Cognitive Services portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
+Note that each Azure deployment is assigned a model when you create it in the Azure OpenAI Service portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
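
Judging from the example at the top of this hunk, each comma-separated AZURE_CREDENTIALS entry appears to bundle a resource name, a deployment ID, and an API key. The sketch below is illustrative only, not the proxy's actual parser; all names in it are assumptions.

```ts
// Illustrative parser for the AZURE_CREDENTIALS format shown above
// (resource-name:deployment-id:api-key, comma-separated). Not the
// proxy's actual code.
interface AzureCredential {
  resourceName: string;
  deploymentId: string;
  apiKey: string;
}

function parseAzureCredentials(raw: string): AzureCredential[] {
  return raw
    .split(",")
    .map((entry) => entry.trim())
    .filter(Boolean)
    .map((entry) => {
      const [resourceName, deploymentId, apiKey] = entry.split(":");
      if (!resourceName || !deploymentId || !apiKey) {
        throw new Error(`Malformed AZURE_CREDENTIALS entry: ${entry}`);
      }
      return { resourceName, deploymentId, apiKey };
    });
}
```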
 
+### Supported model IDs
+
+Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models. For the most part they work the same with Azure. GPT-3.5 Turbo has an ID of "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert this to the correct ID.
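
As a concrete illustration of that conversion (a sketch, not necessarily the proxy's exact logic), mapping an OpenAI model ID to its Azure-safe form mostly means dropping the period from "3.5":

```ts
// Sketch of OpenAI -> Azure model ID normalization; the proxy's real
// implementation may differ.
function toAzureModelId(openaiModelId: string): string {
  // Azure disallows periods in deployment/model names, so
  // "gpt-3.5-turbo" and friends become "gpt-35-turbo".
  return openaiModelId.replace("3.5", "35");
}

console.log(toAzureModelId("gpt-3.5-turbo"));      // "gpt-35-turbo"
console.log(toAzureModelId("gpt-3.5-turbo-1106")); // "gpt-35-turbo-1106"
console.log(toAzureModelId("gpt-4"));              // unchanged: "gpt-4"
```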
+
+As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
+
 ## On content filtering
 
 Be aware that all Azure OpenAI Service deployments have content filtering enabled by default at a Medium level. Prompts or responses which are deemed to be inappropriate will be rejected by the API. This is a feature of the Azure OpenAI Service and not the proxy.
 
 You can disable this from the deployment's settings within Azure, but you would need to request an exemption from Microsoft for your organization first. See [this page](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/content-filters) for more information.
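
If a client wants to distinguish filtered results from other failures, Azure surfaces filtering through the standard OpenAI response shape. The check below is a sketch based on the documented `content_filter` finish reason, not code from this proxy:

```ts
// Sketch: detecting an Azure content-filter rejection on a chat completion.
// Field names follow the standard OpenAI response schema.
interface ChatChoice {
  finish_reason: string;
  message?: { content?: string };
}

function wasContentFiltered(choices: ChatChoice[]): boolean {
  return choices.some((choice) => choice.finish_reason === "content_filter");
}
```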

@@ -0,0 +1,44 @@
+const axios = require("axios");
+
+const concurrentRequests = 5;
+const headers = {
+  Authorization: "Bearer test",
+  "Content-Type": "application/json",
+};
+
+const payload = {
+  model: "gpt-4",
+  max_tokens: 1,
+  stream: false,
+  messages: [{ role: "user", content: "Hi" }],
+};
+
+const makeRequest = async (i) => {
+  try {
+    const response = await axios.post(
+      "http://localhost:7860/proxy/azure/openai/v1/chat/completions",
+      payload,
+      { headers }
+    );
+    console.log(
+      `Req ${i} finished with status code ${response.status} and response:`,
+      response.data
+    );
+  } catch (error) {
+    console.error(`Error in req ${i}:`, error.message);
+  }
+};
+
+const executeRequestsConcurrently = () => {
+  const promises = [];
+  for (let i = 1; i <= concurrentRequests; i++) {
+    console.log(`Starting request ${i}`);
+    promises.push(makeRequest(i));
+  }
+
+  Promise.all(promises).then(() => {
+    console.log("All requests finished");
+  });
+};
+
+executeRequestsConcurrently();
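
The path of this new test script isn't shown in the diff; presumably it can be run directly with Node against a locally running proxy after installing axios (`npm install axios`, then `node <script>`). With the retry handling added below, concurrent requests that trip an Azure deployment's rate limit should be re-enqueued rather than surfacing 429 errors to the client.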

@@ -1,40 +0,0 @@
-$NumThreads = 10
-
-$runspacePool = [runspacefactory]::CreateRunspacePool(1, $NumThreads)
-$runspacePool.Open()
-$runspaces = @()
-
-$headers = @{
-    "Authorization" = "Bearer test"
-    "anthropic-version" = "2023-01-01"
-    "Content-Type" = "application/json"
-}
-
-$payload = @{
-    model = "claude-v2"
-    max_tokens_to_sample = 40
-    temperature = 0
-    stream = $true
-    prompt = "Test prompt, please reply with lorem ipsum`n`n:Assistant:"
-} | ConvertTo-Json
-
-for ($i = 1; $i -le $NumThreads; $i++) {
-    Write-Host "Starting thread $i"
-    $runspace = [powershell]::Create()
-    $runspace.AddScript({
-        param($i, $headers, $payload)
-        $response = Invoke-WebRequest -Uri "http://localhost:7860/proxy/aws/claude/v1/complete" -Method Post -Headers $headers -Body $payload
-        Write-Host "Response from server: $($response.StatusCode)"
-    }).AddArgument($i).AddArgument($headers).AddArgument($payload)
-
-    $runspace.RunspacePool = $runspacePool
-    $runspaces += [PSCustomObject]@{ Pipe = $runspace; Status = $runspace.BeginInvoke() }
-}
-
-$runspaces | ForEach-Object {
-    $_.Pipe.EndInvoke($_.Status)
-    $_.Pipe.Dispose()
-}
-
-$runspacePool.Close()
-$runspacePool.Dispose()

@@ -343,8 +343,10 @@ const handleUpstreamErrors: ProxyResHandlerWithBody = async (
     case "aws":
       handleAwsRateLimitError(req, errorPayload);
       break;
-    case "google-palm":
+    case "azure":
+      handleAzureRateLimitError(req, errorPayload);
+      break;
+    case "google-palm":
       errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
       break;
     default:

@@ -507,6 +509,22 @@ function handleOpenAIRateLimitError(
   return errorPayload;
 }
+
+function handleAzureRateLimitError(
+  req: Request,
+  errorPayload: ProxiedErrorPayload
+) {
+  const code = errorPayload.error?.code;
+  switch (code) {
+    case "429":
+      keyPool.markRateLimited(req.key!);
+      reenqueueRequest(req);
+      throw new RetryableError("Rate-limited request re-enqueued.");
+    default:
+      errorPayload.proxy_note = `Unrecognized rate limit error from Azure (${code}). Please report this.`;
+      break;
+  }
+}
 
 const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
   if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
     const model = req.body.model;
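
For context on why the handler compares against the string "429": Azure OpenAI returns its rate-limit code as a string in the error body. The shape below is a hypothetical reconstruction of such a response, with approximate message text, not captured proxy output:

```ts
// Hypothetical Azure OpenAI rate-limit error body; the string code is what
// handleAzureRateLimitError switches on.
const sampleAzureRateLimitResponse = {
  error: {
    code: "429",
    message:
      "Requests to the ChatCompletions_Create operation have exceeded the " +
      "call rate limit of your current pricing tier. Please retry later.",
  },
};
```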

@@ -15,6 +15,8 @@ import crypto from "crypto";
 import type { Handler, Request } from "express";
 import { keyPool } from "../shared/key-management";
 import {
+  getAwsBedrockModelFamily,
+  getAzureOpenAIModelFamily,
   getClaudeModelFamily,
   getGooglePalmModelFamily,
   getOpenAIModelFamily,

@@ -136,11 +138,10 @@ function getPartitionForRequest(req: Request): ModelFamily {
   // they should be treated as separate queues.
   const model = req.body.model ?? "gpt-3.5-turbo";
 
-  // Weird special case for AWS because they serve multiple models from
+  // Weird special case for AWS/Azure because they serve multiple models from
   // different vendors, even if currently only one is supported.
-  if (req.service === "aws") {
-    return "aws-claude";
-  }
+  if (req.service === "aws") return getAwsBedrockModelFamily(model);
+  if (req.service === "azure") return getAzureOpenAIModelFamily(model);
 
   switch (req.outboundApi) {
     case "anthropic":

@@ -221,7 +222,11 @@ function processQueue() {
 
   reqs.filter(Boolean).forEach((req) => {
     if (req?.proceed) {
-      req.log.info({ retries: req.retryCount }, `Dequeuing request.`);
+      const modelFamily = getPartitionForRequest(req!);
+      req.log.info({
+        retries: req.retryCount,
+        partition: modelFamily,
+      }, `Dequeuing request.`);
       req.proceed();
     }
   });

@@ -415,6 +420,7 @@ function initStreaming(req: Request) {
     // Some clients have a broken SSE parser that doesn't handle comments
     // correctly. These clients can pass ?badSseParser=true to
     // disable comments in the SSE stream.
     res.write(getHeartbeatPayload());
     return;
   }
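
For background (an illustration only, since the body of getHeartbeatPayload isn't in this diff): SSE keep-alives are normally sent as comment lines, which some client parsers mishandle, so a harmless data event can be substituted. Both payloads below are hypothetical:

```ts
// SSE comments start with ":" and are ignored by spec-compliant parsers,
// making them a natural keep-alive. Clients with broken parsers can be sent
// a data event instead.
const heartbeatComment = ": keepalive\n\n";
const heartbeatDataEvent = "data: {}\n\n";
```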

@@ -157,8 +157,11 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
 
   // TODO: all of this shit is duplicate code
 
-  public getLockoutPeriod() {
-    const activeKeys = this.keys.filter((k) => !k.isDisabled);
+  public getLockoutPeriod(family: AzureOpenAIModelFamily) {
+    const activeKeys = this.keys.filter(
+      (key) => !key.isDisabled && key.modelFamilies.includes(family)
+    );
 
     // Don't lock out if there are no keys available or the queue will stall.
     // Just let it through so the add-key middleware can throw an error.
     if (activeKeys.length === 0) return 0;
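
The remainder of the method falls outside this hunk; a plausible shape for the per-family lockout it computes is sketched below. The `rateLimitedUntil` field is a stand-in invented for this sketch, since the real Azure key type's fields aren't shown here:

```ts
// Sketch only: if every active key for the family is currently rate-limited,
// lock the family's queue out until the soonest key frees up; otherwise
// don't lock out at all.
function sketchLockoutPeriod(activeKeys: { rateLimitedUntil: number }[]): number {
  if (activeKeys.length === 0) return 0;
  const now = Date.now();
  const stillLimited = activeKeys.filter((k) => k.rateLimitedUntil > now);
  if (stillLimited.length < activeKeys.length) return 0; // a key is free now
  return Math.min(...stillLimited.map((k) => k.rateLimitedUntil - now));
}
```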

@@ -276,12 +276,9 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
       (key) => !key.isDisabled && key.modelFamilies.includes(family)
     );
 
-    if (activeKeys.length === 0) {
-      // If there are no active keys for this model we can't fulfill requests.
-      // We'll return 0 to let the request through and return an error,
-      // otherwise the request will be stuck in the queue forever.
-      return 0;
-    }
+    // Don't lock out if there are no keys available or the queue will stall.
+    // Just let it through so the add-key middleware can throw an error.
+    if (activeKeys.length === 0) return 0;
 
     // A key is rate-limited if its `rateLimitedAt` plus the greater of its
     // `rateLimitRequestsReset` and `rateLimitTokensReset` is after the