diff --git a/router/src/lib.rs b/router/src/lib.rs
index a97b9b50..64f0fafa 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -611,6 +611,11 @@ pub(crate) struct ChatRequest {
     #[schema(nullable = true, example = 0.1)]
     pub presence_penalty: Option<f32>,
 
+    /// Up to 4 sequences where the API will stop generating further tokens.
+    #[serde(default)]
+    #[schema(nullable = true, example = "null")]
+    pub stop: Option<Vec<String>>,
+
     #[serde(default = "bool::default")]
     pub stream: bool,
 
diff --git a/router/src/server.rs b/router/src/server.rs
index 9c956a73..61aacd0b 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -763,6 +763,7 @@ async fn chat_completions(
         .map(|x| x + 2.0);
     let logprobs = req.logprobs.unwrap_or(false);
     let seed = req.seed;
+    let stop = req.stop.unwrap_or_default();
 
     // apply chat template to flatten the request into a single input
     let mut inputs = match infer.apply_chat_template(req.messages) {
@@ -850,7 +851,7 @@ async fn chat_completions(
             do_sample: true,
             max_new_tokens,
             return_full_text: None,
-            stop: Vec::new(),
+            stop,
             truncate: None,
             watermark: false,
             details: true,