From 23fba672e8156ce19cc518470a5452a9543c56b9 Mon Sep 17 00:00:00 2001 From: Lucain Date: Sat, 16 Mar 2024 17:14:29 +0100 Subject: [PATCH] Fix index in ChatCompletionChunk (#1648) Fix a small inconsistency compared to OpenAI's chat-completion behavior (introduced in https://github.com/huggingface/text-generation-inference/pull/1427 cc @drbh). When using `stream=True`, each chunk has an `index` value in `ChatCompletionChoice`. This index is not meant to be the index of the generated token but the index of the choice, which is always 0 (since TGI always returns a single choice). See https://platform.openai.com/docs/api-reference/chat/object: > index _integer_ > The index of the choice in the list of choices. --- So instead of ```js data:{"id":"","object":"text_completion","created":1710508199,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":1,"delta":{"role":"assistant","content":"I"},"logprobs":null,"finish_reason":null}]} data:{"id":"","object":"text_completion","created":1710508199,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":2,"delta":{"role":"assistant","content":"'"},"logprobs":null,"finish_reason":null}]} data:{"id":"","object":"text_completion","created":1710508199,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":3,"delta":{"role":"assistant","content":"m"},"logprobs":null,"finish_reason":"length"}]} ``` it should return ```js data:{"id":"","object":"text_completion","created":1710508199,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"I"},"logprobs":null,"finish_reason":null}]} 
data:{"id":"","object":"text_completion","created":1710508199,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"'"},"logprobs":null,"finish_reason":null}]} data:{"id":"","object":"text_completion","created":1710508199,"model":"HuggingFaceH4/zephyr-7b-beta","system_fingerprint":"1.4.3-sha-e6bb3ff","choices":[{"index":0,"delta":{"role":"assistant","content":"m"},"logprobs":null,"finish_reason":"length"}]} ``` **EDIT:** I also edited ToolCall.index to be always `0` (instead of the generated token index) but for this one I'm actually unsure. It might be the index of the tool in the array of tools? OpenAI's documentation doesn't provide any information about it: > index _integer_ --- I also noticed that in OpenAI's example, the last chunk doesn't have a delta and is the only one that has a `finish_reason`. TGI is slightly different since the last chunk has both the last delta (i.e. the last generated token) + the finish reason. I don't think this is worth fixing since it is not a requirement according to the docs/specs (at least not that I know of). 
--- router/src/lib.rs | 5 ++--- router/src/server.rs | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/router/src/lib.rs b/router/src/lib.rs index 64f0fafa..50d2cbf4 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -524,7 +524,6 @@ impl ChatCompletionChunk { delta: Option, tool_calls: Option>, created: u64, - index: u32, logprobs: Option, finish_reason: Option, ) -> Self { @@ -535,12 +534,12 @@ impl ChatCompletionChunk { model, system_fingerprint, choices: vec![ChatCompletionChoice { - index, + index: 0, delta: ChatCompletionDelta { role: "assistant".to_string(), content: delta, tool_calls: tool_calls.map(|tc| DeltaToolCall { - index, + index: 0, id: String::new(), r#type: "function".to_string(), function: Function { diff --git a/router/src/server.rs b/router/src/server.rs index 61aacd0b..c8e6017a 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -895,7 +895,6 @@ async fn chat_completions( content, tool_calls, current_time, - stream_token.index, logprobs, stream_token.details.map(|d| d.finish_reason.to_string()), ))