From f512021e77beb9b780c818b30daba58f1329ac11 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Thu, 19 Sep 2024 20:50:37 +0200
Subject: [PATCH 1/5] Stream options. (#2533)

* Stream options.

* Fetch stuff from nix integration test for easier testing.

* Adding the assert.

* Only send the usage when asked for.

* Update the docs.

* Impure test because we need network.

* develop.

* Optional usage.

* Fixes.

* Workflow
---
 .github/workflows/nix_tests.yaml              |   2 +-
 .github/workflows/tests.yaml                  |   1 +
 .redocly.lint-ignore.yaml                     |   2 +
 Cargo.toml                                    |   6 +-
 clients/python/text_generation/types.py       |   3 +-
 docs/openapi.json                             |  29 +++
 flake.lock                                    |  12 +-
 flake.nix                                     |  52 +++--
 ...t_flash_llama_completion_stream_usage.json | 206 ++++++++++++++++++
 .../models/test_completion_prompts.py         | 112 +++++++++-
 nix/client.nix                                |  21 ++
 router/src/lib.rs                             |  36 +++
 router/src/server.rs                          |  31 ++-
 13 files changed, 475 insertions(+), 38 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json
 create mode 100644 nix/client.nix

diff --git a/.github/workflows/nix_tests.yaml b/.github/workflows/nix_tests.yaml
index 06768a7b..f2209f8a 100644
--- a/.github/workflows/nix_tests.yaml
+++ b/.github/workflows/nix_tests.yaml
@@ -38,4 +38,4 @@ jobs:
       env:
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
     - name: Rust tests.
-      run: nix build .#checks.$(nix eval --impure --raw --expr 'builtins.currentSystem').rust -L
+      run: nix develop .#test --command cargo test
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 5ad0fd6a..5f00180c 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -42,6 +42,7 @@ jobs:
           sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
       - name: Install
         run: |
+          sudo apt update
           sudo apt install python3.11-dev -y
           make install-cpu
       - name: Run server tests
diff --git a/.redocly.lint-ignore.yaml b/.redocly.lint-ignore.yaml
index 13b80497..fb02c00f 100644
--- a/.redocly.lint-ignore.yaml
+++ b/.redocly.lint-ignore.yaml
@@ -23,9 +23,11 @@ docs/openapi.json:
     - '#/components/schemas/GenerateResponse/properties/details/nullable'
     - '#/components/schemas/StreamResponse/properties/details/nullable'
     - '#/components/schemas/ChatRequest/properties/response_format/nullable'
+    - '#/components/schemas/ChatRequest/properties/stream_options/nullable'
     - '#/components/schemas/ChatRequest/properties/tool_choice/nullable'
     - '#/components/schemas/ToolChoice/nullable'
     - '#/components/schemas/ChatCompletionComplete/properties/logprobs/nullable'
+    - '#/components/schemas/ChatCompletionChunk/properties/usage/nullable'
     - '#/components/schemas/ChatCompletionChoice/properties/logprobs/nullable'
   no-invalid-media-type-examples:
     - '#/paths/~1/post/responses/422/content/application~1json/example'
diff --git a/Cargo.toml b/Cargo.toml
index a50bba24..ffd45f16 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,7 +5,8 @@ members = [
   "backends/grpc-metadata",
   "backends/trtllm",
   "backends/client",
-  "launcher"
+  "launcher",
+  "router"
 ]
 default-members = [
   "benchmark",
@@ -13,7 +14,8 @@ default-members = [
   "backends/grpc-metadata",
   # "backends/trtllm",
   "backends/client",
-  "launcher"
+  "launcher",
+  "router"
 ]
 resolver = "2"
 
diff --git a/clients/python/text_generation/types.py b/clients/python/text_generation/types.py
index e36dd470..f7f823fc 100644
--- a/clients/python/text_generation/types.py
+++ b/clients/python/text_generation/types.py
@@ -168,7 +168,7 @@ class ChatCompletionComplete(BaseModel):
     # Log probabilities for the chat completion
     logprobs: Optional[Any]
     # Reason for completion
-    finish_reason: str
+    finish_reason: Optional[str]
     # Usage details of the chat completion
     usage: Optional[Any] = None
 
@@ -191,6 +191,7 @@ class ChatCompletionChunk(BaseModel):
     model: str
     system_fingerprint: str
     choices: List[Choice]
+    usage: Optional[Any] = None
 
 
 class Parameters(BaseModel):
diff --git a/docs/openapi.json b/docs/openapi.json
index 691705f2..f8de6564 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -742,6 +742,14 @@
           },
           "system_fingerprint": {
             "type": "string"
+          },
+          "usage": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/Usage"
+              }
+            ],
+            "nullable": true
           }
         }
       },
@@ -937,6 +945,14 @@
           "stream": {
             "type": "boolean"
           },
+          "stream_options": {
+            "allOf": [
+              {
+                "$ref": "#/components/schemas/StreamOptions"
+              }
+            ],
+            "nullable": true
+          },
           "temperature": {
             "type": "number",
             "format": "float",
@@ -1912,6 +1928,19 @@
           }
         }
       },
+      "StreamOptions": {
+        "type": "object",
+        "required": [
+          "include_usage"
+        ],
+        "properties": {
+          "include_usage": {
+            "type": "boolean",
+            "description": "If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.",
+            "example": "true"
+          }
+        }
+      },
       "StreamResponse": {
         "type": "object",
         "required": [
diff --git a/flake.lock b/flake.lock
index a6190789..8d0f4070 100644
--- a/flake.lock
+++ b/flake.lock
@@ -479,11 +479,11 @@
         "systems": "systems_6"
       },
       "locked": {
-        "lastModified": 1710146030,
-        "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
+        "lastModified": 1726560853,
+        "narHash": "sha256-X6rJYSESBVr3hBoH0WbKE5KvhPU5bloyZ2L4K60/fPQ=",
         "owner": "numtide",
         "repo": "flake-utils",
-        "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
+        "rev": "c1dfcf08411b08f6b8615f7d8971a2bfa81d5e8a",
         "type": "github"
       },
       "original": {
@@ -853,11 +853,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1726280639,
-        "narHash": "sha256-YfLRPlFZWrT2oRLNAoqf7G3+NnUTDdlIJk6tmBU7kXM=",
+        "lastModified": 1726626348,
+        "narHash": "sha256-sYV7e1B1yLcxo8/h+/hTwzZYmaju2oObNiy5iRI0C30=",
         "owner": "oxalica",
         "repo": "rust-overlay",
-        "rev": "e9f8641c92f26fd1e076e705edb12147c384171d",
+        "rev": "6fd52ad8bd88f39efb2c999cc971921c2fb9f3a2",
         "type": "github"
       },
       "original": {
diff --git a/flake.nix b/flake.nix
index 3d349ff2..07348e74 100644
--- a/flake.nix
+++ b/flake.nix
@@ -67,31 +67,38 @@
             '';
           };
         server = pkgs.python3.pkgs.callPackage ./nix/server.nix { inherit nix-filter; };
+        client = pkgs.python3.pkgs.callPackage ./nix/client.nix { };
       in
       {
         checks = {
-          rust = with pkgs; rustPlatform.buildRustPackage {
-            name = "rust-checks";
-            src = ./.;
-            cargoLock = {
-              lockFile = ./Cargo.lock;
+          rust =
+            with pkgs;
+            rustPlatform.buildRustPackage {
+              name = "rust-checks";
+              src = ./.;
+              cargoLock = {
+                lockFile = ./Cargo.lock;
+              };
+              buildInputs = [ openssl.dev ];
+              nativeBuildInputs = [
+                clippy
+                pkg-config
+                protobuf
+                python3
+                rustfmt
+              ];
+              buildPhase = ''
+                cargo check
+              '';
+              checkPhase = ''
+                cargo fmt -- --check
+                cargo test -j $NIX_BUILD_CORES
+                cargo clippy
+              '';
+              installPhase = "touch $out";
             };
-            buildInputs = [ openssl.dev ];
-            nativeBuildInputs = [ clippy pkg-config protobuf python3 rustfmt ];
-            buildPhase = ''
-              cargo check
-            '';
-            checkPhase = ''
-              cargo fmt -- --check
-              cargo test -j $NIX_BUILD_CORES
-              cargo clippy
-            '';
-            installPhase = "touch $out";
-          } ;
         };
-
         formatter = pkgs.nixfmt-rfc-style;
-
         devShells = with pkgs; rec {
           default = pure;
 
@@ -106,10 +113,11 @@
           test = mkShell {
             buildInputs =
               [
-                # benchmark
-                # launcher
-                # router
+                benchmark
+                launcher
+                router
                 server
+                client
                 openssl.dev
                 pkg-config
                 cargo
diff --git a/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json
new file mode 100644
index 00000000..8c7be4cb
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_completion_prompts/test_flash_llama_completion_stream_usage.json
@@ -0,0 +1,206 @@
+[
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "**",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656043,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "Deep",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656043,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": " Learning",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656043,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": ":",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656043,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": " An",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656043,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": " Overview",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656043,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "**\n",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656044,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "================================",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656044,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "=====",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": null,
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656044,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": null
+  },
+  {
+    "choices": [
+      {
+        "delta": {
+          "content": "\n\n",
+          "role": "assistant",
+          "tool_calls": null
+        },
+        "finish_reason": "length",
+        "index": 0,
+        "logprobs": null
+      }
+    ],
+    "created": 1726656044,
+    "id": "",
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "2.2.1-dev0-native",
+    "usage": {
+      "completion_tokens": 10,
+      "prompt_tokens": 40,
+      "total_tokens": 50
+    }
+  }
+]
diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py
index a3b6651d..6c359f1e 100644
--- a/integration-tests/models/test_completion_prompts.py
+++ b/integration-tests/models/test_completion_prompts.py
@@ -3,9 +3,7 @@ import requests
 import json
 from aiohttp import ClientSession
 
-from text_generation.types import (
-    Completion,
-)
+from text_generation.types import Completion, ChatCompletionChunk
 
 
 @pytest.fixture(scope="module")
@@ -50,6 +48,114 @@ def test_flash_llama_completion_single_prompt(
     assert response == response_snapshot
 
 
+@pytest.mark.release
+async def test_flash_llama_completion_stream_usage(
+    flash_llama_completion, response_snapshot
+):
+    url = f"{flash_llama_completion.base_url}/v1/chat/completions"
+    request = {
+        "model": "tgi",
+        "messages": [
+            {
+                "role": "user",
+                "content": "What is Deep Learning?",
+            }
+        ],
+        "max_tokens": 10,
+        "temperature": 0.0,
+        "stream_options": {"include_usage": True},
+        "stream": True,
+    }
+    string = ""
+    chunks = []
+    had_usage = False
+    async with ClientSession(headers=flash_llama_completion.headers) as session:
+        async with session.post(url, json=request) as response:
+            # iterate over the stream
+            async for chunk in response.content.iter_any():
+                # decode and split on blank lines (one entry per SSE event)
+                chunk = chunk.decode().split("\n\n")
+                # remove "data:" if present
+                chunk = [c.replace("data:", "") for c in chunk]
+                # remove empty strings
+                chunk = [c for c in chunk if c]
+                # remove completion marking chunk
+                chunk = [c for c in chunk if c != " [DONE]"]
+                # parse json
+                chunk = [json.loads(c) for c in chunk]
+
+                for c in chunk:
+                    chunks.append(ChatCompletionChunk(**c))
+                    assert "choices" in c
+                    if len(c["choices"]) == 1:
+                        index = c["choices"][0]["index"]
+                        assert index == 0
+                        string += c["choices"][0]["delta"]["content"]
+
+                        has_usage = c["usage"] is not None
+                        assert not had_usage
+                        if has_usage:
+                            had_usage = True
+                    else:
+                        raise RuntimeError("Expected different payload")
+    assert had_usage
+    assert (
+        string
+        == "**Deep Learning: An Overview**\n=====================================\n\n"
+    )
+    assert chunks == response_snapshot
+
+    request = {
+        "model": "tgi",
+        "messages": [
+            {
+                "role": "user",
+                "content": "What is Deep Learning?",
+            }
+        ],
+        "max_tokens": 10,
+        "temperature": 0.0,
+        "stream": True,
+    }
+    string = ""
+    chunks = []
+    had_usage = False
+    async with ClientSession(headers=flash_llama_completion.headers) as session:
+        async with session.post(url, json=request) as response:
+            # iterate over the stream
+            async for chunk in response.content.iter_any():
+                # decode and split on blank lines (one entry per SSE event)
+                chunk = chunk.decode().split("\n\n")
+                # remove "data:" if present
+                chunk = [c.replace("data:", "") for c in chunk]
+                # remove empty strings
+                chunk = [c for c in chunk if c]
+                # remove completion marking chunk
+                chunk = [c for c in chunk if c != " [DONE]"]
+                # parse json
+                chunk = [json.loads(c) for c in chunk]
+
+                for c in chunk:
+                    chunks.append(ChatCompletionChunk(**c))
+                    assert "choices" in c
+                    if len(c["choices"]) == 1:
+                        index = c["choices"][0]["index"]
+                        assert index == 0
+                        string += c["choices"][0]["delta"]["content"]
+
+                        has_usage = c["usage"] is not None
+                        assert not had_usage
+                        if has_usage:
+                            had_usage = True
+                    else:
+                        raise RuntimeError("Expected different payload")
+    assert not had_usage
+    assert (
+        string
+        == "**Deep Learning: An Overview**\n=====================================\n\n"
+    )
+
+
 @pytest.mark.release
 def test_flash_llama_completion_many_prompts(flash_llama_completion, response_snapshot):
     response = requests.post(
diff --git a/nix/client.nix b/nix/client.nix
new file mode 100644
index 00000000..351fd08a
--- /dev/null
+++ b/nix/client.nix
@@ -0,0 +1,21 @@
+{
+  buildPythonPackage,
+  poetry-core,
+  huggingface-hub,
+  pydantic,
+}:
+
+buildPythonPackage {
+  name = "text-generation";
+
+  src = ../clients/python;
+
+  pyproject = true;
+
+  build-system = [ poetry-core ];
+
+  dependencies = [
+    huggingface-hub
+    pydantic
+  ];
+}
diff --git a/router/src/lib.rs b/router/src/lib.rs
index d8029c72..ad8924df 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -684,6 +684,7 @@ pub(crate) struct ChatCompletionChunk {
     pub model: String,
     pub system_fingerprint: String,
     pub choices: Vec<ChatCompletionChoice>,
+    pub usage: Option<Usage>,
 }
 
 #[derive(Clone, Serialize, ToSchema)]
@@ -732,6 +733,7 @@ impl ChatCompletionChunk {
         created: u64,
         logprobs: Option<ChatCompletionLogprobs>,
         finish_reason: Option<String>,
+        usage: Option<Usage>,
     ) -> Self {
         let delta = match (delta, tool_calls) {
             (Some(delta), _) => ChatCompletionDelta::Chat(TextMessage {
@@ -766,6 +768,7 @@ impl ChatCompletionChunk {
                 logprobs,
                 finish_reason,
             }],
+            usage,
         }
     }
 }
@@ -880,6 +883,18 @@ pub(crate) struct ChatRequest {
     #[serde(default)]
     #[schema(nullable = true, default = "null", example = "null")]
     pub guideline: Option<String>,
+
+    /// Options for streaming response. Only set this when you set stream: true.
+    #[serde(default)]
+    #[schema(nullable = true, example = "null")]
+    pub stream_options: Option<StreamOptions>,
+}
+
+#[derive(Clone, Deserialize, ToSchema, Serialize)]
+struct StreamOptions {
+    /// If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
+    #[schema(example = "true")]
+    include_usage: bool,
 }
 
 pub fn default_tool_prompt() -> String {
@@ -1472,6 +1487,27 @@ mod tests {
         let textmsg: TextMessage = message.into();
         assert_eq!(textmsg.content, "Whats in this image?![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rabbit.png)");
     }
+
+    #[test]
+    fn test_chat_stream_options() {
+        let json = json!({
+            "model": "",
+            "stream_options": {"include_usage": true},
+            "messages": [{
+                "role": "user",
+                "content": "Hello"
+            }]
+        });
+        let request: ChatRequest = serde_json::from_str(json.to_string().as_str()).unwrap();
+
+        assert!(matches!(
+            request.stream_options,
+            Some(StreamOptions {
+                include_usage: true
+            })
+        ));
+    }
+
     #[test]
     fn openai_output() {
         let message = OutputMessage::ChatMessage(TextMessage {
diff --git a/router/src/server.rs b/router/src/server.rs
index 9cec2aaa..32c86e0f 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -13,8 +13,8 @@ use crate::{
     usage_stats, BestOfSequence, Details, ErrorResponse, FinishReason, FunctionName,
     GenerateParameters, GenerateRequest, GenerateResponse, GrammarType, HubModelInfo,
     HubProcessorConfig, HubTokenizerConfig, Info, Message, MessageChunk, MessageContent,
-    OutputMessage, PrefillToken, SimpleToken, StreamDetails, StreamResponse, TextMessage, Token,
-    TokenizeResponse, ToolCallDelta, ToolCallMessage, Url, Usage, Validation,
+    OutputMessage, PrefillToken, SimpleToken, StreamDetails, StreamOptions, StreamResponse,
+    TextMessage, Token, TokenizeResponse, ToolCallDelta, ToolCallMessage, Url, Usage, Validation,
 };
 use crate::{
     ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
@@ -1175,6 +1175,7 @@ async fn chat_completions(
         seed,
         stop,
         stream,
+        stream_options,
         tools,
         tool_choice,
         tool_prompt,
@@ -1265,6 +1266,28 @@ async fn chat_completions(
                 (content, None)
             };
 
+            let (usage, finish_reason) = match stream_token.details {
+                Some(details) => {
+                    let usage = if stream_options
+                        .as_ref()
+                        .map(|s| s.include_usage)
+                        .unwrap_or(false)
+                    {
+                        let completion_tokens = details.generated_tokens;
+                        let prompt_tokens = details.input_length;
+                        let total_tokens = prompt_tokens + completion_tokens;
+                        Some(Usage {
+                            completion_tokens,
+                            prompt_tokens,
+                            total_tokens,
+                        })
+                    } else {
+                        None
+                    };
+                    (usage, Some(details.finish_reason.format(true)))
+                }
+                None => (None, None),
+            };
             event
                 .json_data(CompletionType::ChatCompletionChunk(
                     ChatCompletionChunk::new(
@@ -1274,7 +1297,8 @@ async fn chat_completions(
                         tool_calls,
                         current_time,
                         logprobs,
-                        stream_token.details.map(|d| d.finish_reason.format(true)),
+                        finish_reason,
+                        usage,
                     ),
                 ))
                 .unwrap_or_else(|e| {
@@ -1664,6 +1688,7 @@ StreamDetails,
 ErrorResponse,
 GrammarType,
 Usage,
+StreamOptions,
 DeltaToolCall,
 ToolType,
 Tool,

From c1037601721904337876d139f8fb2d5cf2ba227e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 19 Sep 2024 22:16:32 +0200
Subject: [PATCH 2/5] Update to moe-kernels 0.3.1 (#2535)

* Update to moe-kernels 0.3.1

* Attempt to fix apt failure
---
 flake.lock            |  6 +++---
 server/poetry.lock    | 24 ++++++++++++------------
 server/pyproject.toml |  8 ++++----
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/flake.lock b/flake.lock
index 8d0f4070..d811be5e 100644
--- a/flake.lock
+++ b/flake.lock
@@ -978,11 +978,11 @@
         "nixpkgs": "nixpkgs_6"
       },
       "locked": {
-        "lastModified": 1726229792,
-        "narHash": "sha256-9xsLmjc9nr7a4PTddKv2DOi82ompTtJNyjO6R67y5tE=",
+        "lastModified": 1726743157,
+        "narHash": "sha256-7OczwJsA47o+aUftMwkoh8R31DlNSl2FgRjqE8zAggk=",
         "owner": "danieldk",
         "repo": "tgi-nix",
-        "rev": "1a902f4818e94c3f8d95f6000db17bc3fadd0ce7",
+        "rev": "bcc9fd01cf81bc42cebb999a736a377adfa8942f",
         "type": "github"
       },
       "original": {
diff --git a/server/poetry.lock b/server/poetry.lock
index eeb22204..8d0e31f8 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -1244,12 +1244,12 @@ files = [
 
 [[package]]
 name = "moe-kernels"
-version = "0.2.2"
+version = "0.3.1"
 description = "MoE kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "moe_kernels-0.2.2+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:d268d818932ddcbca9bc71021dc63b008aae832827a7c0484cf206bd59cfc9ab"},
+    {file = "moe_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:b679984a53807127f25af053ec0a2c07dec97ec196f76363a8bfdc3fbb3d1a9a"},
 ]
 
 [package.dependencies]
@@ -1259,16 +1259,16 @@ triton = "*"
 
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
+url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"
 
 [[package]]
 name = "moe-kernels"
-version = "0.2.2"
+version = "0.3.1"
 description = "MoE kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "moe_kernels-0.2.2+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614bbc3f41b707b0c40372f0bb00e218ad0842d306f90bef28ce8e98e7fcb7cb"},
+    {file = "moe_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:29684f81495f6e032085295c86d160022f03d5d9a9981446f311ca94fbbbc2cd"},
 ]
 
 [package.dependencies]
@@ -1278,16 +1278,16 @@ triton = "*"
 
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
+url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"
 
 [[package]]
 name = "moe-kernels"
-version = "0.2.2"
+version = "0.3.1"
 description = "MoE kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "moe_kernels-0.2.2+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:c2f48ed541353be03157d4015270dff797f7b7b8a664babdcbdf7414867d5abd"},
+    {file = "moe_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:9dfdbef48b5b7e97912aaa7420b1b694876a3281f5edfe7d4ca9a69e1f48bff2"},
 ]
 
 [package.dependencies]
@@ -1297,16 +1297,16 @@ triton = "*"
 
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
+url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"
 
 [[package]]
 name = "moe-kernels"
-version = "0.2.2"
+version = "0.3.1"
 description = "MoE kernels"
 optional = true
 python-versions = ">=3.7"
 files = [
-    {file = "moe_kernels-0.2.2+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:d5f0339b73426c422872f7ff060433df6cd8e881451baf85ee7454e0e905f9d8"},
+    {file = "moe_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:f7d0fc8f191c905a668f3d2eb889999ee988048d08bfd7062d64bca3876588ae"},
 ]
 
 [package.dependencies]
@@ -1316,7 +1316,7 @@ triton = "*"
 
 [package.source]
 type = "url"
-url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
+url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"
 
 [[package]]
 name = "mpmath"
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 6eee1e72..6bdd2385 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -47,10 +47,10 @@ marlin-kernels = [
   { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
 ]
 moe-kernels = [
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.2.2/moe_kernels-0.2.2+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
+  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
+  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
+  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
+  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
 ]
 rich = "^13.7.1"
 

From abd24dd38593e5a973dd3828b5a292c9aa2b2b09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 19 Sep 2024 22:17:15 +0200
Subject: [PATCH 3/5] doc: clarify that `--quantize` is not needed for
 pre-quantized models (#2536)

---
 docs/source/reference/launcher.md | 4 +++-
 flake.nix                         | 1 +
 launcher/src/main.rs              | 6 +++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md
index 01f15648..c8d2a4c6 100644
--- a/docs/source/reference/launcher.md
+++ b/docs/source/reference/launcher.md
@@ -55,7 +55,9 @@ Options:
 ## QUANTIZE
 ```shell
       --quantize <QUANTIZE>
-          Whether you want the model to be quantized
+          Quantization method to use for the model. It is not necessary to specify this option for pre-quantized models, since the quantization method is read from the model configuration.
+          
+          Marlin kernels will be used automatically for GPTQ/AWQ models.
           
           [env: QUANTIZE=]
 
diff --git a/flake.nix b/flake.nix
index 07348e74..260b2554 100644
--- a/flake.nix
+++ b/flake.nix
@@ -157,6 +157,7 @@
                 pyright
                 pytest
                 pytest-asyncio
+                redocly
                 ruff
                 syrupy
               ]);
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 2cdccfe0..175244ff 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -367,7 +367,11 @@ struct Args {
     #[clap(long, env)]
     num_shard: Option<usize>,
 
-    /// Whether you want the model to be quantized.
+    /// Quantization method to use for the model. It is not necessary to specify this option
+    /// for pre-quantized models, since the quantization method is read from the model
+    /// configuration.
+    ///
+    /// Marlin kernels will be used automatically for GPTQ/AWQ models.
     #[clap(long, env, value_enum)]
     quantize: Option<Quantization>,
 

From f478aa77ade6d3f2baca72ae148afb73ef5bf748 Mon Sep 17 00:00:00 2001
From: "Wang, Yi" <yi.a.wang@intel.com>
Date: Fri, 20 Sep 2024 16:02:55 +0800
Subject: [PATCH 4/5] hotfix: ipex fails since cuda moe kernel is not supported
 (#2532)

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
---
 .../models/custom_modeling/flash_deepseek_v2_modeling.py      | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
index 12be08cd..328f239b 100644
--- a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py
@@ -15,7 +15,6 @@
 
 from typing import List, Optional, Tuple
 
-from moe_kernels.fused_moe import grouped_topk
 import torch
 import torch.distributed
 from text_generation_server.layers import (
@@ -41,6 +40,9 @@ from torch import nn
 from transformers.activations import ACT2FN
 from transformers.configuration_utils import PretrainedConfig
 
+if SYSTEM != "ipex":
+    from moe_kernels.fused_moe import grouped_topk
+
 if SYSTEM == "rocm":
     try:
         from vllm import _custom_C

From 64e981fdcf08c1750b75593777aa50d65bfe6a6f Mon Sep 17 00:00:00 2001
From: Mohit Sharma <mohit21sharma.ms@gmail.com>
Date: Tue, 24 Sep 2024 10:53:19 +0000
Subject: [PATCH 5/5] fix: make paged Seqlen.clamp a no-op on ROCm for sliding window models

---
 server/text_generation_server/layers/attention/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/server/text_generation_server/layers/attention/common.py b/server/text_generation_server/layers/attention/common.py
index 855f4dfc..d6e512c0 100644
--- a/server/text_generation_server/layers/attention/common.py
+++ b/server/text_generation_server/layers/attention/common.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models.globals import ATTENTION
 import torch
 from typing import Optional
@@ -65,5 +66,7 @@ else:
         max_k: int
 
         def clamp(self, max):
+            if SYSTEM == "rocm":  # ROCm: skip clamping and return this Seqlen unchanged instead of raising
+                return self
             raise NotImplementedError("Not implemented seqlen for paged")
             return Seqlen(torch.clamp(self.input_lengths, max=max))