From fdd824f0e45161f459cda4791695c2c1584de44b Mon Sep 17 00:00:00 2001
From: nai-degen
Date: Mon, 4 Dec 2023 01:23:55 -0600
Subject: [PATCH] adds azure rate limit auto-retry

---
 docs/azure-configuration.md                  |  7 +++-
 scripts/test-concurrency.js                  | 44 ++++++++++++++++++++
 scripts/test_concurrency.ps1                 | 40 ------------------
 src/proxy/middleware/response/index.ts       | 20 ++++++++-
 src/proxy/queue.ts                           | 16 ++++---
 src/shared/key-management/azure/provider.ts  |  7 +++-
 src/shared/key-management/openai/provider.ts |  9 ++--
 7 files changed, 88 insertions(+), 55 deletions(-)
 create mode 100644 scripts/test-concurrency.js
 delete mode 100644 scripts/test_concurrency.ps1

diff --git a/docs/azure-configuration.md b/docs/azure-configuration.md
index 2ee172b..5f94581 100644
--- a/docs/azure-configuration.md
+++ b/docs/azure-configuration.md
@@ -17,9 +17,14 @@ AZURE_CREDENTIALS=contoso-ml:gpt4-8k:0123456789abcdef0123456789abcdef,northwind-
 ```
 
 ## Model assignment
-Note that each Azure deployment is assigned a model when you create it in the Microsoft Cognitive Services portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
+Note that each Azure deployment is assigned a model when you create it in the Azure OpenAI Service portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
 
 ### Supported model IDs
 Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models. For the most part they work the same with Azure. GPT-3.5 Turbo has an ID of "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert this to the correct ID. As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
+
+## On content filtering
+Be aware that all Azure OpenAI Service deployments have content filtering enabled by default at a Medium level. Prompts or responses that are deemed inappropriate will be rejected by the API. This is a feature of the Azure OpenAI Service and not the proxy.
+
+You can disable this from the deployment's settings within Azure, but you would need to request an exemption from Microsoft for your organization first. See [this page](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/content-filters) for more information.
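Aside on the "Supported model IDs" note above: the conversion it describes amounts to stripping periods from the OpenAI model ID, since Azure deployment model names cannot contain them. A minimal sketch of that idea, using a hypothetical helper name rather than the proxy's actual code:

```ts
// Hypothetical illustration only; the proxy's real mapping logic may differ.
function toAzureModelId(openaiModelId: string): string {
  // Azure disallows periods in model names, e.g. "gpt-3.5-turbo" -> "gpt-35-turbo".
  return openaiModelId.replace(/\./g, "");
}

console.log(toAzureModelId("gpt-3.5-turbo")); // "gpt-35-turbo"
```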
diff --git a/scripts/test-concurrency.js b/scripts/test-concurrency.js
new file mode 100644
index 0000000..e5d1b33
--- /dev/null
+++ b/scripts/test-concurrency.js
@@ -0,0 +1,44 @@
+const axios = require("axios");
+
+const concurrentRequests = 5;
+const headers = {
+  Authorization: "Bearer test",
+  "Content-Type": "application/json",
+};
+
+const payload = {
+  model: "gpt-4",
+  max_tokens: 1,
+  stream: false,
+  messages: [{ role: "user", content: "Hi" }],
+};
+
+const makeRequest = async (i) => {
+  try {
+    const response = await axios.post(
+      "http://localhost:7860/proxy/azure/openai/v1/chat/completions",
+      payload,
+      { headers }
+    );
+    console.log(
+      `Req ${i} finished with status code ${response.status} and response:`,
+      response.data
+    );
+  } catch (error) {
+    console.error(`Error in req ${i}:`, error.message);
+  }
+};
+
+const executeRequestsConcurrently = () => {
+  const promises = [];
+  for (let i = 1; i <= concurrentRequests; i++) {
+    console.log(`Starting request ${i}`);
+    promises.push(makeRequest(i));
+  }
+
+  Promise.all(promises).then(() => {
+    console.log("All requests finished");
+  });
+};
+
+executeRequestsConcurrently();
diff --git a/scripts/test_concurrency.ps1 b/scripts/test_concurrency.ps1
deleted file mode 100644
index b802dbe..0000000
--- a/scripts/test_concurrency.ps1
+++ /dev/null
@@ -1,40 +0,0 @@
-$NumThreads = 10
-
-$runspacePool = [runspacefactory]::CreateRunspacePool(1, $NumThreads)
-$runspacePool.Open()
-$runspaces = @()
-
-$headers = @{
-    "Authorization" = "Bearer test"
-    "anthropic-version" = "2023-01-01"
-    "Content-Type" = "application/json"
-}
-
-$payload = @{
-    model = "claude-v2"
-    max_tokens_to_sample = 40
-    temperature = 0
-    stream = $true
-    prompt = "Test prompt, please reply with lorem ipsum`n`n:Assistant:"
-} | ConvertTo-Json
-
-for ($i = 1; $i -le $NumThreads; $i++) {
-    Write-Host "Starting thread $i"
-    $runspace = [powershell]::Create()
-    $runspace.AddScript({
-        param($i, $headers, $payload)
-        $response = Invoke-WebRequest -Uri "http://localhost:7860/proxy/aws/claude/v1/complete" -Method Post -Headers $headers -Body $payload
-        Write-Host "Response from server: $($response.StatusCode)"
-    }).AddArgument($i).AddArgument($headers).AddArgument($payload)
-
-    $runspace.RunspacePool = $runspacePool
-    $runspaces += [PSCustomObject]@{ Pipe = $runspace; Status = $runspace.BeginInvoke() }
-}
-
-$runspaces | ForEach-Object {
-    $_.Pipe.EndInvoke($_.Status)
-    $_.Pipe.Dispose()
-}
-
-$runspacePool.Close()
-$runspacePool.Dispose()
diff --git a/src/proxy/middleware/response/index.ts b/src/proxy/middleware/response/index.ts
index 1dbfb0e..4fea396 100644
--- a/src/proxy/middleware/response/index.ts
+++ b/src/proxy/middleware/response/index.ts
@@ -343,8 +343,10 @@ const handleUpstreamErrors: ProxyResHandlerWithBody = async (
     case "aws":
       handleAwsRateLimitError(req, errorPayload);
       break;
-    case "google-palm":
     case "azure":
+      handleAzureRateLimitError(req, errorPayload);
+      break;
+    case "google-palm":
      errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
      break;
    default:
@@ -507,6 +509,22 @@ function handleOpenAIRateLimitError(
   return errorPayload;
 }
 
+function handleAzureRateLimitError(
+  req: Request,
+  errorPayload: ProxiedErrorPayload
+) {
+  const code = errorPayload.error?.code;
+  switch (code) {
+    case "429":
+      keyPool.markRateLimited(req.key!);
+      reenqueueRequest(req);
+      throw new RetryableError("Rate-limited request re-enqueued.");
+    default:
+      errorPayload.proxy_note = `Unrecognized rate limit error from Azure (${code}). Please report this.`;
+      break;
+  }
+}
+
 const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
   if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
     const model = req.body.model;
diff --git a/src/proxy/queue.ts b/src/proxy/queue.ts
index a971e30..abe128c 100644
--- a/src/proxy/queue.ts
+++ b/src/proxy/queue.ts
@@ -15,6 +15,8 @@ import crypto from "crypto";
 import type { Handler, Request } from "express";
 import { keyPool } from "../shared/key-management";
 import {
+  getAwsBedrockModelFamily,
+  getAzureOpenAIModelFamily,
   getClaudeModelFamily,
   getGooglePalmModelFamily,
   getOpenAIModelFamily,
@@ -136,11 +138,10 @@ function getPartitionForRequest(req: Request): ModelFamily {
   // they should be treated as separate queues.
   const model = req.body.model ?? "gpt-3.5-turbo";
 
-  // Weird special case for AWS because they serve multiple models from
+  // Weird special case for AWS/Azure because they serve multiple models from
   // different vendors, even if currently only one is supported.
-  if (req.service === "aws") {
-    return "aws-claude";
-  }
+  if (req.service === "aws") return getAwsBedrockModelFamily(model);
+  if (req.service === "azure") return getAzureOpenAIModelFamily(model);
 
   switch (req.outboundApi) {
     case "anthropic":
@@ -221,7 +222,11 @@ function processQueue() {
 
   reqs.filter(Boolean).forEach((req) => {
     if (req?.proceed) {
-      req.log.info({ retries: req.retryCount }, `Dequeuing request.`);
+      const modelFamily = getPartitionForRequest(req!);
+      req.log.info({
+        retries: req.retryCount,
+        partition: modelFamily,
+      }, `Dequeuing request.`);
       req.proceed();
     }
   });
@@ -415,6 +420,7 @@ function initStreaming(req: Request) {
     // Some clients have a broken SSE parser that doesn't handle comments
     // correctly. These clients can pass ?badSseParser=true to
     // disable comments in the SSE stream.
+    res.write(getHeartbeatPayload());
     return;
   }
 
diff --git a/src/shared/key-management/azure/provider.ts b/src/shared/key-management/azure/provider.ts
index 5256b7e..aac5380 100644
--- a/src/shared/key-management/azure/provider.ts
+++ b/src/shared/key-management/azure/provider.ts
@@ -157,8 +157,11 @@ export class AzureOpenAIKeyProvider implements KeyProvider {
 
   // TODO: all of this shit is duplicate code
 
-  public getLockoutPeriod() {
-    const activeKeys = this.keys.filter((k) => !k.isDisabled);
+  public getLockoutPeriod(family: AzureOpenAIModelFamily) {
+    const activeKeys = this.keys.filter(
+      (key) => !key.isDisabled && key.modelFamilies.includes(family)
+    );
+
+    // Don't lock out if there are no keys available or the queue will stall.
+    // Just let it through so the add-key middleware can throw an error.
     if (activeKeys.length === 0) return 0;
diff --git a/src/shared/key-management/openai/provider.ts b/src/shared/key-management/openai/provider.ts
index db2055d..214ab1e 100644
--- a/src/shared/key-management/openai/provider.ts
+++ b/src/shared/key-management/openai/provider.ts
@@ -276,12 +276,9 @@ export class OpenAIKeyProvider implements KeyProvider {
       (key) => !key.isDisabled && key.modelFamilies.includes(family)
     );
 
-    if (activeKeys.length === 0) {
-      // If there are no active keys for this model we can't fulfill requests.
-      // We'll return 0 to let the request through and return an error,
-      // otherwise the request will be stuck in the queue forever.
-      return 0;
-    }
+    // Don't lock out if there are no keys available or the queue will stall.
+    // Just let it through so the add-key middleware can throw an error.
+    if (activeKeys.length === 0) return 0;
 
     // A key is rate-limited if its `rateLimitedAt` plus the greater of its
     // `rateLimitRequestsReset` and `rateLimitTokensReset` is after the