adds azure rate limit auto-retry
commit fdd824f0e4 (parent fbdea30264)

@@ -17,9 +17,14 @@ AZURE_CREDENTIALS=contoso-ml:gpt4-8k:0123456789abcdef0123456789abcdef,northwind-
 ```
 
 ## Model assignment
 
-Note that each Azure deployment is assigned a model when you create it in the Microsoft Cognitive Services portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
+Note that each Azure deployment is assigned a model when you create it in the Azure OpenAI Service portal. If you want to use a different model, you'll need to create a new deployment, and therefore a new key to be added to the AZURE_CREDENTIALS environment variable. Each credential only grants access to one model.
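
Judging from the example at the top of this hunk, each comma-separated AZURE_CREDENTIALS entry appears to bundle a resource name, a deployment ID, and an API key. The sketch below is illustrative only, not the proxy's actual parser; all names in it are assumptions.

```ts
// Illustrative parser for the AZURE_CREDENTIALS format shown above
// (resource-name:deployment-id:api-key, comma-separated). Not the
// proxy's actual code.
interface AzureCredential {
  resourceName: string;
  deploymentId: string;
  apiKey: string;
}

function parseAzureCredentials(raw: string): AzureCredential[] {
  return raw
    .split(",")
    .map((entry) => entry.trim())
    .filter(Boolean)
    .map((entry) => {
      const [resourceName, deploymentId, apiKey] = entry.split(":");
      if (!resourceName || !deploymentId || !apiKey) {
        throw new Error(`Malformed AZURE_CREDENTIALS entry: ${entry}`);
      }
      return { resourceName, deploymentId, apiKey };
    });
}
```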
 
+### Supported model IDs
+
+Users can send normal OpenAI model IDs to the proxy to invoke the corresponding models. For the most part they work the same with Azure. GPT-3.5 Turbo has an ID of "gpt-35-turbo" because Azure doesn't allow periods in model names, but the proxy should automatically convert this to the correct ID.
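
As a concrete illustration of that conversion (a sketch, not necessarily the proxy's exact logic), mapping an OpenAI model ID to its Azure-safe form mostly means dropping the period from "3.5":

```ts
// Sketch of OpenAI -> Azure model ID normalization; the proxy's real
// implementation may differ.
function toAzureModelId(openaiModelId: string): string {
  // Azure disallows periods in deployment/model names, so
  // "gpt-3.5-turbo" and friends become "gpt-35-turbo".
  return openaiModelId.replace("3.5", "35");
}

console.log(toAzureModelId("gpt-3.5-turbo"));      // "gpt-35-turbo"
console.log(toAzureModelId("gpt-3.5-turbo-1106")); // "gpt-35-turbo-1106"
console.log(toAzureModelId("gpt-4"));              // unchanged: "gpt-4"
```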
+
+As noted above, you can only use model IDs for which a deployment has been created and added to the proxy.
+
 ## On content filtering
 
 Be aware that all Azure OpenAI Service deployments have content filtering enabled by default at a Medium level. Prompts or responses which are deemed to be inappropriate will be rejected by the API. This is a feature of the Azure OpenAI Service and not the proxy.
 
 You can disable this from the deployment's settings within Azure, but you would need to request an exemption from Microsoft for your organization first. See [this page](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/content-filters) for more information.
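
If a client wants to distinguish filtered results from other failures, Azure surfaces filtering through the standard OpenAI response shape. The check below is a sketch based on the documented `content_filter` finish reason, not code from this proxy:

```ts
// Sketch: detecting an Azure content-filter rejection on a chat completion.
// Field names follow the standard OpenAI response schema.
interface ChatChoice {
  finish_reason: string;
  message?: { content?: string };
}

function wasContentFiltered(choices: ChatChoice[]): boolean {
  return choices.some((choice) => choice.finish_reason === "content_filter");
}
```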

@@ -0,0 +1,44 @@
+const axios = require("axios");
+
+const concurrentRequests = 5;
+const headers = {
+  Authorization: "Bearer test",
+  "Content-Type": "application/json",
+};
+
+const payload = {
+  model: "gpt-4",
+  max_tokens: 1,
+  stream: false,
+  messages: [{ role: "user", content: "Hi" }],
+};
+
+const makeRequest = async (i) => {
+  try {
+    const response = await axios.post(
+      "http://localhost:7860/proxy/azure/openai/v1/chat/completions",
+      payload,
+      { headers }
+    );
+    console.log(
+      `Req ${i} finished with status code ${response.status} and response:`,
+      response.data
+    );
+  } catch (error) {
+    console.error(`Error in req ${i}:`, error.message);
+  }
+};
+
+const executeRequestsConcurrently = () => {
+  const promises = [];
+  for (let i = 1; i <= concurrentRequests; i++) {
+    console.log(`Starting request ${i}`);
+    promises.push(makeRequest(i));
+  }
+
+  Promise.all(promises).then(() => {
+    console.log("All requests finished");
+  });
+};
+
+executeRequestsConcurrently();
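
The path of this new test script isn't shown in the diff; presumably it can be run directly with Node against a locally running proxy after installing axios (`npm install axios`, then `node <script>`). With the retry handling added below, concurrent requests that trip an Azure deployment's rate limit should be re-enqueued rather than surfacing 429 errors to the client.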

@@ -1,40 +0,0 @@
-$NumThreads = 10
-
-$runspacePool = [runspacefactory]::CreateRunspacePool(1, $NumThreads)
-$runspacePool.Open()
-$runspaces = @()
-
-$headers = @{
-    "Authorization" = "Bearer test"
-    "anthropic-version" = "2023-01-01"
-    "Content-Type" = "application/json"
-}
-
-$payload = @{
-    model = "claude-v2"
-    max_tokens_to_sample = 40
-    temperature = 0
-    stream = $true
-    prompt = "Test prompt, please reply with lorem ipsum`n`n:Assistant:"
-} | ConvertTo-Json
-
-for ($i = 1; $i -le $NumThreads; $i++) {
-    Write-Host "Starting thread $i"
-    $runspace = [powershell]::Create()
-    $runspace.AddScript({
-        param($i, $headers, $payload)
-        $response = Invoke-WebRequest -Uri "http://localhost:7860/proxy/aws/claude/v1/complete" -Method Post -Headers $headers -Body $payload
-        Write-Host "Response from server: $($response.StatusCode)"
-    }).AddArgument($i).AddArgument($headers).AddArgument($payload)
-
-    $runspace.RunspacePool = $runspacePool
-    $runspaces += [PSCustomObject]@{ Pipe = $runspace; Status = $runspace.BeginInvoke() }
-}
-
-$runspaces | ForEach-Object {
-    $_.Pipe.EndInvoke($_.Status)
-    $_.Pipe.Dispose()
-}
-
-$runspacePool.Close()
-$runspacePool.Dispose()

@@ -343,8 +343,10 @@ const handleUpstreamErrors: ProxyResHandlerWithBody = async (
     case "aws":
       handleAwsRateLimitError(req, errorPayload);
       break;
-    case "google-palm":
+    case "azure":
+      handleAzureRateLimitError(req, errorPayload);
+      break;
+    case "google-palm":
       errorPayload.proxy_note = `Automatic rate limit retries are not supported for this service. Try again in a few seconds.`;
       break;
     default:

@@ -507,6 +509,22 @@ function handleOpenAIRateLimitError(
   return errorPayload;
 }
+
+function handleAzureRateLimitError(
+  req: Request,
+  errorPayload: ProxiedErrorPayload
+) {
+  const code = errorPayload.error?.code;
+  switch (code) {
+    case "429":
+      keyPool.markRateLimited(req.key!);
+      reenqueueRequest(req);
+      throw new RetryableError("Rate-limited request re-enqueued.");
+    default:
+      errorPayload.proxy_note = `Unrecognized rate limit error from Azure (${code}). Please report this.`;
+      break;
+  }
+}
 
 const incrementUsage: ProxyResHandlerWithBody = async (_proxyRes, req) => {
   if (isTextGenerationRequest(req) || isImageGenerationRequest(req)) {
     const model = req.body.model;
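
For context on why the handler compares against the string "429": Azure OpenAI returns its rate-limit code as a string in the error body. The shape below is a hypothetical reconstruction of such a response, with approximate message text, not captured proxy output:

```ts
// Hypothetical Azure OpenAI rate-limit error body; the string code is what
// handleAzureRateLimitError switches on.
const sampleAzureRateLimitResponse = {
  error: {
    code: "429",
    message:
      "Requests to the ChatCompletions_Create operation have exceeded the " +
      "call rate limit of your current pricing tier. Please retry later.",
  },
};
```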

@@ -15,6 +15,8 @@ import crypto from "crypto";
 import type { Handler, Request } from "express";
 import { keyPool } from "../shared/key-management";
 import {
+  getAwsBedrockModelFamily,
+  getAzureOpenAIModelFamily,
   getClaudeModelFamily,
   getGooglePalmModelFamily,
   getOpenAIModelFamily,

@@ -136,11 +138,10 @@ function getPartitionForRequest(req: Request): ModelFamily {
   // they should be treated as separate queues.
   const model = req.body.model ?? "gpt-3.5-turbo";
 
-  // Weird special case for AWS because they serve multiple models from
+  // Weird special case for AWS/Azure because they serve multiple models from
   // different vendors, even if currently only one is supported.
-  if (req.service === "aws") {
-    return "aws-claude";
-  }
+  if (req.service === "aws") return getAwsBedrockModelFamily(model);
+  if (req.service === "azure") return getAzureOpenAIModelFamily(model);
 
   switch (req.outboundApi) {
     case "anthropic":

@@ -221,7 +222,11 @@ function processQueue() {
 
   reqs.filter(Boolean).forEach((req) => {
     if (req?.proceed) {
-      req.log.info({ retries: req.retryCount }, `Dequeuing request.`);
+      const modelFamily = getPartitionForRequest(req!);
+      req.log.info({
+        retries: req.retryCount,
+        partition: modelFamily,
+      }, `Dequeuing request.`);
       req.proceed();
     }
   });

@@ -415,6 +420,7 @@ function initStreaming(req: Request) {
     // Some clients have a broken SSE parser that doesn't handle comments
     // correctly. These clients can pass ?badSseParser=true to
     // disable comments in the SSE stream.
     res.write(getHeartbeatPayload());
     return;
   }
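
For background (an illustration only, since the body of getHeartbeatPayload isn't in this diff): SSE keep-alives are normally sent as comment lines, which some client parsers mishandle, so a harmless data event can be substituted. Both payloads below are hypothetical:

```ts
// SSE comments start with ":" and are ignored by spec-compliant parsers,
// making them a natural keep-alive. Clients with broken parsers can be sent
// a data event instead.
const heartbeatComment = ": keepalive\n\n";
const heartbeatDataEvent = "data: {}\n\n";
```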

@@ -157,8 +157,11 @@ export class AzureOpenAIKeyProvider implements KeyProvider<AzureOpenAIKey> {
 
   // TODO: all of this shit is duplicate code
 
-  public getLockoutPeriod() {
-    const activeKeys = this.keys.filter((k) => !k.isDisabled);
+  public getLockoutPeriod(family: AzureOpenAIModelFamily) {
+    const activeKeys = this.keys.filter(
+      (key) => !key.isDisabled && key.modelFamilies.includes(family)
+    );
 
     // Don't lock out if there are no keys available or the queue will stall.
     // Just let it through so the add-key middleware can throw an error.
     if (activeKeys.length === 0) return 0;
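
The remainder of the method falls outside this hunk; a plausible shape for the per-family lockout it computes is sketched below. The `rateLimitedUntil` field is a stand-in invented for this sketch, since the real Azure key type's fields aren't shown here:

```ts
// Sketch only: if every active key for the family is currently rate-limited,
// lock the family's queue out until the soonest key frees up; otherwise
// don't lock out at all.
function sketchLockoutPeriod(activeKeys: { rateLimitedUntil: number }[]): number {
  if (activeKeys.length === 0) return 0;
  const now = Date.now();
  const stillLimited = activeKeys.filter((k) => k.rateLimitedUntil > now);
  if (stillLimited.length < activeKeys.length) return 0; // a key is free now
  return Math.min(...stillLimited.map((k) => k.rateLimitedUntil - now));
}
```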

@@ -276,12 +276,9 @@ export class OpenAIKeyProvider implements KeyProvider<OpenAIKey> {
       (key) => !key.isDisabled && key.modelFamilies.includes(family)
     );
 
-    if (activeKeys.length === 0) {
-      // If there are no active keys for this model we can't fulfill requests.
-      // We'll return 0 to let the request through and return an error,
-      // otherwise the request will be stuck in the queue forever.
-      return 0;
-    }
+    // Don't lock out if there are no keys available or the queue will stall.
+    // Just let it through so the add-key middleware can throw an error.
+    if (activeKeys.length === 0) return 0;
 
     // A key is rate-limited if its `rateLimitedAt` plus the greater of its
     // `rateLimitRequestsReset` and `rateLimitTokensReset` is after the