Add stop parameter to completions route

2024-05-07 19:27:05 +02:00 · 2024-05-07 19:27:05 +02:00 · 2f644779cb
parent 9fb1cdc8d5
commit 2f644779cb
3 changed files with 34 additions and 7 deletions
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -1121,6 +1121,15 @@
            "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
            "example": 0.95,
            "nullable": true
          },
          "stop": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "description": "Up to 4 sequences where the API will stop generating further tokens.",
            "example": "null",
            "nullable": true
          }
        }
      },
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -401,6 +401,11 @@ pub struct CompletionRequest {
    #[serde(default)]
    #[schema(example = "1.0")]
    pub frequency_penalty: Option<f32>,
    /// Up to 4 sequences where the API will stop generating further tokens.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stop: Option<Vec<String>>,
 }
 #[derive(Clone, Deserialize, Serialize, ToSchema, Default)]
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -597,9 +597,22 @@ async fn completions(
    let span = tracing::Span::current();
    metrics::increment_counter!("tgi_request_count");
-    let stream = req.stream;
+	let CompletionRequest {
-    let max_new_tokens = req.max_tokens.or(Some(100));
+        max_tokens,
-    let seed = req.seed;
+        seed,
        stop,
        stream,
        temperature,
        ..
    } = req;
    let max_new_tokens = max_tokens.or(Some(100));
    let stop = stop.unwrap_or_default();
    // enable greedy only when temperature is 0
    let (do_sample, temperature) = match temperature {
        Some(temperature) if temperature == 0.0 => (false, None),
        other => (true, other),
    };
    // if suffix is present throw an error
    if req.suffix.is_some() {
@@ -629,22 +642,22 @@ async fn completions(
    }
    let generate_requests: Vec<GenerateRequest> = req
-        .prompt
+    	.prompt
        .iter()
        .map(|prompt| GenerateRequest {
            inputs: prompt.to_string(),
            parameters: GenerateParameters {
                best_of: None,
-                temperature: req.temperature,
+                temperature: temperature,
                repetition_penalty: req.repetition_penalty,
                frequency_penalty: req.frequency_penalty,
                top_k: None,
                top_p: req.top_p,
                typical_p: None,
-                do_sample: true,
+                do_sample,
                max_new_tokens,
                return_full_text: None,
-                stop: Vec::new(),
+                stop: stop.clone(),
                truncate: None,
                watermark: false,
                details: true,