From 376a0b7ada91548a68798383cb008ea01c728b30 Mon Sep 17 00:00:00 2001
From: drbh <david.richard.holtz@gmail.com>
Date: Tue, 11 Jun 2024 10:44:56 -0400
Subject: [PATCH] Support chat response format (#2046)

* feat: support response_format in chat

* fix: adjust typos

* fix: add trufflehog lint
---
 .github/workflows/trufflehog.yml              |   1 -
 ...st_grammar_response_format_llama_json.json |  23 ++++
 .../test_grammar_response_format_llama.py     | 101 ++++++++++++++++++
 router/src/lib.rs                             |   8 ++
 router/src/server.rs                          |  30 ++++--
 5 files changed, 156 insertions(+), 7 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json
 create mode 100644 integration-tests/models/test_grammar_response_format_llama.py

diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
index 8bc60eff..b406d43b 100644
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -16,4 +16,3 @@ jobs:
         fetch-depth: 0
     - name: Secret Scanning
       uses: trufflesecurity/trufflehog@main
-
diff --git a/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json
new file mode 100644
index 00000000..83390832
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_grammar_response_format_llama/test_grammar_response_format_llama_json.json
@@ -0,0 +1,23 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": "{\n  \"temperature\": [\n    35,\n    34,\n    36\n  ],\n  \"unit\": \"°c\"\n}",
+        "role": "assistant"
+      }
+    }
+  ],
+  "created": 1718044128,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.5-dev0-native",
+  "usage": {
+    "completion_tokens": 39,
+    "prompt_tokens": 136,
+    "total_tokens": 175
+  }
+}
diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py
new file mode 100644
index 00000000..9c4c048e
--- /dev/null
+++ b/integration-tests/models/test_grammar_response_format_llama.py
@@ -0,0 +1,101 @@
+import pytest
+import requests
+from pydantic import BaseModel
+from typing import List
+
+
+@pytest.fixture(scope="module")
+def llama_grammar_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        num_shard=1,
+        disable_grammar_support=False,
+        use_flash_attention=False,
+        max_batch_prefill_tokens=3000,
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def llama_grammar(llama_grammar_handle):
+    await llama_grammar_handle.health(300)
+    return llama_grammar_handle.client
+
+
+@pytest.mark.asyncio
+async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot):
+
+    class Weather(BaseModel):
+        unit: str
+        temperature: List[int]
+
+    # send the request
+    response = requests.post(
+        f"{llama_grammar.base_url}/v1/chat/completions",
+        headers=llama_grammar.headers,
+        json={
+            "model": "tgi",
+            "messages": [
+                {
+                    "role": "system",
+                    "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}",
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like the next 3 days in San Francisco, CA?",
+                },
+            ],
+            "seed": 42,
+            "max_tokens": 500,
+            "response_format": {"type": "json_object", "value": Weather.schema()},
+        },
+    )
+
+    chat_completion = response.json()
+    called = chat_completion["choices"][0]["message"]["content"]
+
+    assert response.status_code == 200
+    assert (
+        called
+        == '{\n  "temperature": [\n    35,\n    34,\n    36\n  ],\n  "unit": "°c"\n}'
+    )
+    assert chat_completion == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_grammar_response_format_llama_error_if_tools_not_installed(
+    llama_grammar,
+):
+    class Weather(BaseModel):
+        unit: str
+        temperature: List[int]
+
+    # send the request
+    response = requests.post(
+        f"{llama_grammar.base_url}/v1/chat/completions",
+        headers=llama_grammar.headers,
+        json={
+            "model": "tgi",
+            "messages": [
+                {
+                    "role": "system",
+                    "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}",
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like the next 3 days in San Francisco, CA?",
+                },
+            ],
+            "seed": 42,
+            "max_tokens": 500,
+            "tools": [],
+            "response_format": {"type": "json_object", "value": Weather.schema()},
+        },
+    )
+
+    # 422 means the server was unable to process the request because it contains invalid data.
+    assert response.status_code == 422
+    assert response.json() == {
+        "error": "Grammar and tools are mutually exclusive",
+        "error_type": "grammar and tools",
+    }
diff --git a/router/src/lib.rs b/router/src/lib.rs
index b6902c49..1016019d 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -89,6 +89,7 @@ pub(crate) enum GrammarType {
     /// JSON Schema is a declarative language that allows to annotate JSON documents
     /// with types and descriptions.
     #[serde(rename = "json")]
+    #[serde(alias = "json_object")]
     #[schema(example = json ! ({"properties": {"location":{"type": "string"}}}))]
     Json(serde_json::Value),
     #[serde(rename = "regex")]
@@ -791,6 +792,13 @@ pub(crate) struct ChatRequest {
     #[schema(nullable = true, example = "null")]
     #[serde(deserialize_with = "deserialize_tool_choice::deserialize")]
     pub tool_choice: Option<ToolType>,
+
+    /// Response format constraints for the generation.
+    ///
+    /// NOTE: A request can use `response_format` OR `tools` but not both.
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub response_format: Option<GrammarType>,
 }
 
 fn default_tool_prompt() -> Option<String> {
diff --git a/router/src/server.rs b/router/src/server.rs
index 30479b0e..e3c2c4f9 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -1016,6 +1016,7 @@ async fn chat_completions(
         tool_choice,
         tool_prompt,
         temperature,
+        response_format,
         ..
     } = req;
 
@@ -1030,6 +1031,18 @@ async fn chat_completions(
         other => (true, other),
     };
 
+    // response_format and tools are mutually exclusive
+    if response_format.is_some() && tools.as_ref().is_some() {
+        metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: "Grammar and tools are mutually exclusive".to_string(),
+                error_type: "grammar and tools".to_string(),
+            }),
+        ));
+    }
+
     // extract tool grammar if present
     let tool_grammar = match ToolGrammar::apply(tools, tool_choice) {
         Ok(grammar) => grammar,
@@ -1046,16 +1059,21 @@ async fn chat_completions(
         }
     };
 
-    let grammar_with_prompt = tool_grammar
+    // determine the appropriate arguments for apply_chat_template
+    let tools_grammar_prompt = tool_grammar
         .as_ref()
         .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt));
 
-    let typed_grammar = grammar_with_prompt
-        .as_ref()
-        .map(|(grammar, _)| grammar.clone());
+    let (tools_grammar_prompt, grammar) = match response_format {
+        Some(response_format) => (None, Some(response_format)),
+        None => (
+            tools_grammar_prompt.clone(),
+            tools_grammar_prompt.map(|(grammar, _)| grammar.clone()),
+        ),
+    };
 
     // apply chat template to flatten the request into a single input
-    let inputs = match infer.apply_chat_template(messages, grammar_with_prompt) {
+    let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) {
         Ok(inputs) => inputs,
         Err(err) => {
             metrics::increment_counter!("tgi_request_failure", "err" => "validation");
@@ -1091,7 +1109,7 @@ async fn chat_completions(
             decoder_input_details: !stream,
             seed,
             top_n_tokens: req.top_logprobs,
-            grammar: typed_grammar,
+            grammar,
         },
     };