Support chat response format (#2046)

* feat: support response_format in chat
* fix: adjust typos
* fix: add trufflehog lint
parent a6e4d63c86
commit 376a0b7ada
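This change adds an optional `response_format` field to the chat endpoint, reusing the router's existing grammar support: `json_object` becomes a serde alias for the `json` grammar type, a request that sets both `response_format` and `tools` is rejected with a 422, and the commit ships an integration test, a response snapshot, and a Trufflehog secret-scanning step in CI. A minimal client-side sketch of the new field (the host and port are assumptions and the schema is illustrative; the payload shape follows the integration test below):

```python
import requests

# Hypothetical local TGI server; any JSON Schema can go in "value".
response = requests.post(
    "http://localhost:3000/v1/chat/completions",
    json={
        "model": "tgi",
        "messages": [
            {
                "role": "user",
                "content": "What's the weather like the next 3 days in San Francisco, CA?",
            }
        ],
        "max_tokens": 500,
        # "json_object" works because of the new serde alias; "json" does too.
        "response_format": {
            "type": "json_object",
            "value": {
                "properties": {
                    "unit": {"type": "string"},
                    "temperature": {"type": "array", "items": {"type": "integer"}},
                }
            },
        },
    },
)
print(response.json()["choices"][0]["message"]["content"])
```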
@@ -16,4 +16,3 @@ jobs:
           fetch-depth: 0
       - name: Secret Scanning
         uses: trufflesecurity/trufflehog@main
-
@@ -0,0 +1,23 @@
+{
+  "choices": [
+    {
+      "finish_reason": "eos_token",
+      "index": 0,
+      "logprobs": null,
+      "message": {
+        "content": "{\n \"temperature\": [\n 35,\n 34,\n 36\n ],\n \"unit\": \"°c\"\n}",
+        "role": "assistant"
+      }
+    }
+  ],
+  "created": 1718044128,
+  "id": "",
+  "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+  "object": "text_completion",
+  "system_fingerprint": "2.0.5-dev0-native",
+  "usage": {
+    "completion_tokens": 39,
+    "prompt_tokens": 136,
+    "total_tokens": 175
+  }
+}
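The assistant `content` in this snapshot is itself a JSON document conforming to the requested schema, not free-form text. A quick sketch of pulling it back out:

```python
import json

# The exact "content" string from the snapshot above.
content = '{\n "temperature": [\n 35,\n 34,\n 36\n ],\n "unit": "°c"\n}'
weather = json.loads(content)
assert weather == {"temperature": [35, 34, 36], "unit": "°c"}
```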
@@ -0,0 +1,101 @@
+import pytest
+import requests
+from pydantic import BaseModel
+from typing import List
+
+
+@pytest.fixture(scope="module")
+def llama_grammar_handle(launcher):
+    with launcher(
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        num_shard=1,
+        disable_grammar_support=False,
+        use_flash_attention=False,
+        max_batch_prefill_tokens=3000,
+    ) as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def llama_grammar(llama_grammar_handle):
+    await llama_grammar_handle.health(300)
+    return llama_grammar_handle.client
+
+
+@pytest.mark.asyncio
+async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot):
+
+    class Weather(BaseModel):
+        unit: str
+        temperature: List[int]
+
+    # send the request
+    response = requests.post(
+        f"{llama_grammar.base_url}/v1/chat/completions",
+        headers=llama_grammar.headers,
+        json={
+            "model": "tgi",
+            "messages": [
+                {
+                    "role": "system",
+                    "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}",
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like the next 3 days in San Francisco, CA?",
+                },
+            ],
+            "seed": 42,
+            "max_tokens": 500,
+            "response_format": {"type": "json_object", "value": Weather.schema()},
+        },
+    )
+
+    chat_completion = response.json()
+    called = chat_completion["choices"][0]["message"]["content"]
+
+    assert response.status_code == 200
+    assert (
+        called
+        == '{\n "temperature": [\n 35,\n 34,\n 36\n ],\n "unit": "°c"\n}'
+    )
+    assert chat_completion == response_snapshot
+
+
+@pytest.mark.asyncio
+async def test_grammar_response_format_llama_error_if_tools_not_installed(
+    llama_grammar,
+):
+    class Weather(BaseModel):
+        unit: str
+        temperature: List[int]
+
+    # send the request
+    response = requests.post(
+        f"{llama_grammar.base_url}/v1/chat/completions",
+        headers=llama_grammar.headers,
+        json={
+            "model": "tgi",
+            "messages": [
+                {
+                    "role": "system",
+                    "content": f"Respond to the users questions and answer them in the following format: {Weather.schema()}",
+                },
+                {
+                    "role": "user",
+                    "content": "What's the weather like the next 3 days in San Francisco, CA?",
+                },
+            ],
+            "seed": 42,
+            "max_tokens": 500,
+            "tools": [],
+            "response_format": {"type": "json_object", "value": Weather.schema()},
+        },
+    )
+
+    # 422 means the server was unable to process the request because it contains invalid data.
+    assert response.status_code == 422
+    assert response.json() == {
+        "error": "Grammar and tools are mutually exclusive",
+        "error_type": "grammar and tools",
+    }
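Because the request derives its grammar from `Weather.schema()`, the asserted string also round-trips through the same Pydantic model. A sketch using the pydantic v1 API the test itself relies on (`.schema()`, so `parse_raw` is the matching parser):

```python
from typing import List

from pydantic import BaseModel


class Weather(BaseModel):
    unit: str
    temperature: List[int]


# The exact string the first test asserts against.
raw = '{\n "temperature": [\n 35,\n 34,\n 36\n ],\n "unit": "°c"\n}'
weather = Weather.parse_raw(raw)
assert weather.unit == "°c"
assert weather.temperature == [35, 34, 36]
```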
@@ -89,6 +89,7 @@ pub(crate) enum GrammarType {
     /// JSON Schema is a declarative language that allows to annotate JSON documents
     /// with types and descriptions.
     #[serde(rename = "json")]
+    #[serde(alias = "json_object")]
     #[schema(example = json ! ({"properties": {"location":{"type": "string"}}}))]
     Json(serde_json::Value),
     #[serde(rename = "regex")]
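The added `#[serde(alias = "json_object")]` lets OpenAI-style clients use the `json_object` spelling; both forms deserialize to the same `GrammarType::Json` variant. Two equivalent wire payloads:

```python
schema = {"properties": {"location": {"type": "string"}}}

# Both deserialize to GrammarType::Json on the router side after this change.
native_form = {"type": "json", "value": schema}
openai_form = {"type": "json_object", "value": schema}
```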
@@ -791,6 +792,13 @@ pub(crate) struct ChatRequest {
     #[schema(nullable = true, example = "null")]
     #[serde(deserialize_with = "deserialize_tool_choice::deserialize")]
     pub tool_choice: Option<ToolType>,
+
+    /// Response format constraints for the generation.
+    ///
+    /// NOTE: A request can use `response_format` OR `tools` but not both.
+    #[serde(default)]
+    #[schema(nullable = true, default = "null", example = "null")]
+    pub response_format: Option<GrammarType>,
 }

 fn default_tool_prompt() -> Option<String> {
@@ -1016,6 +1016,7 @@ async fn chat_completions(
         tool_choice,
         tool_prompt,
         temperature,
+        response_format,
         ..
     } = req;

@@ -1030,6 +1031,18 @@ async fn chat_completions(
         other => (true, other),
     };

+    // response_format and tools are mutually exclusive
+    if response_format.is_some() && tools.as_ref().is_some() {
+        metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+        return Err((
+            StatusCode::UNPROCESSABLE_ENTITY,
+            Json(ErrorResponse {
+                error: "Grammar and tools are mutually exclusive".to_string(),
+                error_type: "grammar and tools".to_string(),
+            }),
+        ));
+    }
+
     // extract tool grammar if present
     let tool_grammar = match ToolGrammar::apply(tools, tool_choice) {
         Ok(grammar) => grammar,
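Tools and `response_format` both compile down to a single grammar constraint (see the next hunk), so the router rejects requests that set both rather than silently picking one. A sketch of what a client observes, mirroring the second integration test (the server address is an assumption):

```python
import requests

response = requests.post(
    "http://localhost:3000/v1/chat/completions",
    json={
        "model": "tgi",
        "messages": [{"role": "user", "content": "Hi"}],
        "tools": [],
        "response_format": {"type": "json_object", "value": {}},
    },
)
assert response.status_code == 422
assert response.json() == {
    "error": "Grammar and tools are mutually exclusive",
    "error_type": "grammar and tools",
}
```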
@@ -1046,16 +1059,21 @@ async fn chat_completions(
         }
     };

-    let grammar_with_prompt = tool_grammar
+    // determine the appropriate arguments for apply_chat_template
+    let tools_grammar_prompt = tool_grammar
         .as_ref()
         .map(|t| (GrammarType::Json(serde_json::json!(t)), tool_prompt));

-    let typed_grammar = grammar_with_prompt
-        .as_ref()
-        .map(|(grammar, _)| grammar.clone());
+    let (tools_grammar_prompt, grammar) = match response_format {
+        Some(response_format) => (None, Some(response_format)),
+        None => (
+            tools_grammar_prompt.clone(),
+            tools_grammar_prompt.map(|(grammar, _)| grammar.clone()),
+        ),
+    };

     // apply chat template to flatten the request into a single input
-    let inputs = match infer.apply_chat_template(messages, grammar_with_prompt) {
+    let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) {
         Ok(inputs) => inputs,
         Err(err) => {
             metrics::increment_counter!("tgi_request_failure", "err" => "validation");
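The new `match` gives `response_format` precedence: when it is set, the grammar comes straight from the request and no tool prompt is threaded into the chat template; otherwise both are derived from the tool grammar, as before. A Python paraphrase of that selection logic (illustrative only, the authoritative code is the Rust above):

```python
def resolve_grammar(response_format, tool_grammar, tool_prompt):
    """Paraphrase of the (tools_grammar_prompt, grammar) match in the router."""
    if response_format is not None:
        # response_format wins: no tool prompt, grammar taken from the request.
        return None, response_format
    if tool_grammar is not None:
        # Tools path: the (grammar, prompt) pair feeds apply_chat_template and
        # the grammar half also constrains generation.
        pair = ({"type": "json", "value": tool_grammar}, tool_prompt)
        return pair, pair[0]
    return None, None
```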
@@ -1091,7 +1109,7 @@ async fn chat_completions(
         decoder_input_details: !stream,
         seed,
         top_n_tokens: req.top_logprobs,
-        grammar: typed_grammar,
+        grammar,
     },
 };