feat(router): new healthcheck that skips the queue (#244)
Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com>
Co-authored-by: OlivierDehaene <olivier@huggingface.co>
This commit is contained in:
parent c4fb09f2ae
commit db2b4e0754
@@ -67,7 +67,10 @@ jobs:
         run: |
           pip install pytest
           HF_HUB_ENABLE_HF_TRANSFER=1 pytest -sv server/tests
-      - name: Run Clippy
+      - name: Run Rust fmt
+        run: |
+          cargo fmt --check
+      - name: Run Rust clippy
         run: |
           cargo clippy
       - name: Run Rust tests
@@ -493,6 +493,7 @@ fn download_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), LauncherE
     Ok(())
 }
 
+#[allow(clippy::too_many_arguments)]
 fn spawn_shards(
     num_shard: usize,
     args: &Args,
@@ -515,11 +516,11 @@ fn spawn_shards(
         let shutdown = shutdown.clone();
         let shutdown_sender = shutdown_sender.clone();
         let otlp_endpoint = args.otlp_endpoint.clone();
-        let quantize = args.quantize.clone();
-        let master_port = args.master_port.clone();
-        let disable_custom_kernels = args.disable_custom_kernels.clone();
-        let watermark_gamma = args.watermark_gamma.clone();
-        let watermark_delta = args.watermark_delta.clone();
+        let quantize = args.quantize;
+        let master_port = args.master_port;
+        let disable_custom_kernels = args.disable_custom_kernels;
+        let watermark_gamma = args.watermark_gamma;
+        let watermark_delta = args.watermark_delta;
         thread::spawn(move || {
             shard_manager(
                 model_id,
@@ -559,12 +560,12 @@ fn spawn_shards(
             }
             Ok(ShardStatus::Failed((rank, err))) => {
                 tracing::error!("Shard {} failed to start:\n{}", rank, err);
-                shutdown_shards(shutdown, &shutdown_receiver);
+                shutdown_shards(shutdown, shutdown_receiver);
                 return Err(LauncherError::ShardCannotStart);
             }
             Err(TryRecvError::Disconnected) => {
                 tracing::error!("Shard status channel disconnected");
-                shutdown_shards(shutdown, &shutdown_receiver);
+                shutdown_shards(shutdown, shutdown_receiver);
                 return Err(LauncherError::ShardDisconnected);
             }
         }
@@ -666,7 +667,7 @@ fn spawn_webserver(
             tracing::error!("{}", err);
         }
 
-        shutdown_shards(shutdown, &shutdown_receiver);
+        shutdown_shards(shutdown, shutdown_receiver);
         return Err(LauncherError::WebserverCannotStart);
     }
 };
@@ -15,8 +15,13 @@ service TextGenerationService {
     rpc Prefill (PrefillRequest) returns (PrefillResponse);
     /// Decode token for a list of prefilled batches
     rpc Decode (DecodeRequest) returns (DecodeResponse);
+    /// Health check
+    rpc Health (HealthRequest) returns (HealthResponse);
 }
 
+message HealthRequest {}
+message HealthResponse {}
+
 /// Empty request
 message InfoRequest {}
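The new RPC takes and returns empty messages, so the probe carries no payload. For reference, the prost-generated Rust types backing these messages should be, roughly, empty structs; this is a sketch of generated code, not part of the diff:

// Rough shape of the prost-generated types for the new messages
// (illustrative; the real definitions live in the generated pb module).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthRequest {}

#[derive(Clone, PartialEq, ::prost::Message)]
pub struct HealthResponse {}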
@@ -7,7 +7,7 @@ use tonic::transport::{Channel, Uri};
 use tracing::instrument;
 
 /// Text Generation Inference gRPC client
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub struct Client {
     stub: TextGenerationServiceClient<Channel>,
 }
@@ -62,6 +62,14 @@ impl Client {
         Ok(response)
     }
 
+    /// Get model health
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let request = tonic::Request::new(HealthRequest {}).inject_context();
+        let response = self.stub.health(request).await?.into_inner();
+        Ok(response)
+    }
+
     /// Clear the past generations cache
     #[instrument(skip(self))]
     pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
@@ -6,6 +6,7 @@ mod pb;
 mod sharded_client;
 
 pub use client::Client;
+pub use pb::generate::v1::HealthResponse;
 pub use pb::generate::v1::InfoResponse as ShardInfo;
 pub use pb::generate::v1::{
     Batch, FinishReason, GeneratedText, Generation, NextTokenChooserParameters, PrefillTokens,
@@ -1,10 +1,11 @@
 /// Multi shard Client
 use crate::Result;
-use crate::{Batch, Client, Generation, Request, ShardInfo};
+use crate::{Batch, Client, Generation, HealthResponse, Request, ShardInfo};
 use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;
 
+#[derive(Debug, Clone)]
 /// Text Generation Inference gRPC multi client
 pub struct ShardedClient {
     clients: Vec<Client>,
@@ -48,6 +49,17 @@ impl ShardedClient {
         join_all(futures).await.pop().unwrap()
     }
 
+    /// GRPC health check
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.health())
+            .collect();
+        join_all(futures).await.pop().unwrap()
+    }
+
     /// Clear the past generations cache
     #[instrument(skip(self))]
     pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
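Like the other ShardedClient helpers, health() fans the call out to every shard with join_all and returns the last shard's result. One possible way to use it while waiting for shards to come up, sketched with an illustrative helper name and polling interval that are not part of this PR:

use std::time::Duration;
use text_generation_client::ShardedClient;

// Hypothetical helper (not part of the PR): poll the new Health RPC until
// the sharded call reports success.
async fn wait_until_healthy(client: &mut ShardedClient) {
    while client.health().await.is_err() {
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
}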
@@ -0,0 +1,62 @@
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+use text_generation_client::{
+    Batch, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
+};
+
+#[derive(Clone, Debug)]
+pub(crate) struct Health {
+    client: ShardedClient,
+    generation_health: Arc<AtomicBool>,
+}
+
+impl Health {
+    pub(crate) fn new(client: ShardedClient, generation_health: Arc<AtomicBool>) -> Self {
+        Self {
+            client,
+            generation_health,
+        }
+    }
+
+    pub(crate) async fn check(&mut self) -> bool {
+        if self.generation_health.load(Ordering::SeqCst) {
+            // Generation is healthy, we only check that the shards are answering gRPC calls
+            self.client.health().await.is_ok()
+        } else {
+            // Generation is unhealthy or have not sent any generation request yet
+
+            // Dummy batch of 1 token and 1 generated token
+            let liveness_request = Request {
+                id: u64::MAX,
+                inputs: "liveness".to_string(),
+                truncate: 10,
+                parameters: Some(NextTokenChooserParameters {
+                    temperature: 1.0,
+                    top_k: 0,
+                    top_p: 1.0,
+                    typical_p: 1.0,
+                    do_sample: false,
+                    seed: 0,
+                    repetition_penalty: 1.0,
+                    watermark: false,
+                }),
+                stopping_parameters: Some(StoppingCriteriaParameters {
+                    max_new_tokens: 1,
+                    stop_sequences: vec![],
+                    ignore_eos_token: false,
+                }),
+            };
+            let batch = Batch {
+                id: u64::MAX,
+                requests: vec![liveness_request],
+                size: 1,
+                max_tokens: 2,
+            };
+            // Skips the queue
+            let value = self.client.prefill(batch).await.is_ok();
+            // Update generation health
+            self.generation_health.store(value, Ordering::SeqCst);
+            value
+        }
+    }
+}
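This new module is the heart of the change: until a real generation has succeeded, check() sends a one-token dummy prefill straight through the ShardedClient, bypassing the router queue, and records the outcome; once generation is known to be healthy, it only pings the cheap Health RPC. The dummy request and batch reuse id u64::MAX, presumably to keep them clear of the ids the queue hands out. A minimal sketch of the gating decision, written only for illustration (the helper name is not code from the PR):

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

// Illustrative helper: decide which probe the health check should run.
// `true` in the flag means a real Prefill/Decode call succeeded recently,
// so a lightweight gRPC Health ping is enough.
fn needs_prefill_probe(generation_health: &Arc<AtomicBool>) -> bool {
    !generation_health.load(Ordering::SeqCst)
}

fn main() {
    let flag = Arc::new(AtomicBool::new(false));
    assert!(needs_prefill_probe(&flag)); // cold start: run the dummy prefill
    flag.store(true, Ordering::SeqCst);
    assert!(!needs_prefill_probe(&flag)); // generation healthy: gRPC ping only
}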
@@ -7,7 +7,10 @@ use flume::SendError;
 use futures::future::try_join_all;
 use futures::stream::StreamExt;
 use nohash_hasher::IntMap;
-use std::sync::Arc;
+use std::sync::{
+    atomic::{AtomicBool, Ordering},
+    Arc,
+};
 use text_generation_client::{
     Batch, ClientError, GeneratedText, Generation, PrefillTokens, ShardedClient,
 };
@@ -36,6 +39,7 @@ struct Shared {
 }
 
 impl Infer {
+    #[allow(clippy::too_many_arguments)]
     pub(crate) fn new(
         client: ShardedClient,
         validation: Validation,
@@ -44,6 +48,7 @@ impl Infer {
         max_waiting_tokens: usize,
         max_concurrent_requests: usize,
         requires_padding: bool,
+        generation_health: Arc<AtomicBool>,
     ) -> Self {
         // Infer shared state
         let queue = Queue::new(requires_padding);
@@ -59,6 +64,7 @@ impl Infer {
             max_waiting_tokens,
             queue.clone(),
             shared.clone(),
+            generation_health,
         ));
 
         // Inference limit with a semaphore
@@ -240,6 +246,7 @@ async fn batching_task(
     max_waiting_tokens: usize,
     queue: Queue,
     shared: Arc<Shared>,
+    generation_health: Arc<AtomicBool>,
 ) {
     // Infinite loop
     loop {
@@ -252,7 +259,7 @@ async fn batching_task(
         while let Some((mut entries, batch, span)) =
             queue.next_batch(None, max_batch_total_tokens).await
         {
-            let mut cached_batch = prefill(&mut client, batch, &mut entries)
+            let mut cached_batch = prefill(&mut client, batch, &mut entries, &generation_health)
                 .instrument(span)
                 .await;
             let mut waiting_tokens = 1;
@@ -301,7 +308,8 @@ async fn batching_task(
                     });
 
                     // Generate one token for this new batch to have the attention past in cache
-                    let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries)
+                    let new_cached_batch =
+                        prefill(&mut client, new_batch, &mut new_entries, &generation_health)
                             .instrument(span)
                             .await;
                     // Reset waiting counter
@@ -327,7 +335,7 @@ async fn batching_task(
                     entry.temp_span = Some(entry_batch_span);
                 });
 
-                cached_batch = decode(&mut client, batches, &mut entries)
+                cached_batch = decode(&mut client, batches, &mut entries, &generation_health)
                     .instrument(next_batch_span)
                     .await;
                 waiting_tokens += 1;
@@ -343,6 +351,7 @@ async fn prefill(
     client: &mut ShardedClient,
     batch: Batch,
     entries: &mut IntMap<u64, Entry>,
+    generation_health: &Arc<AtomicBool>,
 ) -> Option<Batch> {
     let start_time = Instant::now();
     let batch_id = batch.id;
@@ -350,6 +359,8 @@ async fn prefill(
 
     match client.prefill(batch).await {
         Ok((generations, next_batch)) => {
+            // Update health
+            generation_health.store(true, Ordering::SeqCst);
             // Send generated tokens and filter stopped entries
             filter_send_generations(generations, entries);
 
@@ -362,6 +373,8 @@ async fn prefill(
         }
         // If we have an error, we discard the whole batch
         Err(err) => {
+            // Update health
+            generation_health.store(false, Ordering::SeqCst);
             let _ = client.clear_cache(Some(batch_id)).await;
             send_errors(err, entries);
             metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
@@ -375,6 +388,7 @@ async fn decode(
     client: &mut ShardedClient,
     batches: Vec<Batch>,
     entries: &mut IntMap<u64, Entry>,
+    generation_health: &Arc<AtomicBool>,
 ) -> Option<Batch> {
     let start_time = Instant::now();
     let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
@@ -382,6 +396,8 @@ async fn decode(
 
     match client.decode(batches).await {
         Ok((generations, next_batch)) => {
+            // Update health
+            generation_health.store(true, Ordering::SeqCst);
             // Send generated tokens and filter stopped entries
             filter_send_generations(generations, entries);
 
@@ -394,6 +410,7 @@ async fn decode(
         }
         // If we have an error, we discard the whole batch
         Err(err) => {
+            generation_health.store(false, Ordering::SeqCst);
             for id in batch_ids {
                 let _ = client.clear_cache(Some(id)).await;
             }
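The batching loop is where the shared flag gets its value: prefill() and decode() now store true after a successful shard call and false after a failed one, which is exactly what Health::check reads. The bookkeeping reduces to the following sketch (the record_generation_health helper is illustrative, not code from the PR):

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

// Illustrative helper: mirror the outcome of a shard call into the shared flag
// read by the /health endpoint, then hand the result back to the caller.
fn record_generation_health<T, E>(
    generation_health: &Arc<AtomicBool>,
    result: Result<T, E>,
) -> Result<T, E> {
    generation_health.store(result.is_ok(), Ordering::SeqCst);
    result
}

fn main() {
    let flag = Arc::new(AtomicBool::new(false));
    let _ = record_generation_health::<(), &str>(&flag, Ok(()));
    assert!(flag.load(Ordering::SeqCst));
}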
@@ -1,3 +1,4 @@
+mod health;
 /// Text Generation Inference Webserver
 mod infer;
 mod queue;
@@ -284,11 +285,15 @@ mod tests{
 
     pub(crate) async fn get_tokenizer() -> Tokenizer {
         if !std::path::Path::new("tokenizer.json").exists() {
-            let content = reqwest::get("https://huggingface.co/gpt2/raw/main/tokenizer.json").await.unwrap().bytes().await.unwrap();
+            let content = reqwest::get("https://huggingface.co/gpt2/raw/main/tokenizer.json")
+                .await
+                .unwrap()
+                .bytes()
+                .await
+                .unwrap();
             let mut file = std::fs::File::create("tokenizer.json").unwrap();
             file.write_all(&content).unwrap();
         }
         Tokenizer::from_file("tokenizer.json").unwrap()
     }
 }
@@ -141,7 +141,6 @@ impl State {
 
     // Get the next batch
     fn next_batch(&mut self, min_size: Option<usize>, token_budget: u32) -> Option<NextBatch> {
-
         if self.entries.is_empty() {
             return None;
         }
@@ -1,3 +1,4 @@
+use crate::health::Health;
 /// HTTP Server logic
 use crate::infer::{InferError, InferResponse, InferStreamResponse};
 use crate::validation::ValidationError;
@@ -18,6 +19,8 @@ use futures::Stream;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
 use std::convert::Infallible;
 use std::net::SocketAddr;
+use std::sync::atomic::AtomicBool;
+use std::sync::Arc;
 use text_generation_client::{ShardInfo, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::signal;
@@ -82,36 +85,29 @@ async fn get_model_info(info: Extension<Info>) -> Json<Info> {
     Json(info.0)
 }
 
+#[utoipa::path(
+get,
+tag = "Text Generation Inference",
+path = "/health",
+responses(
+(status = 200, description = "Everything is working fine"),
+(status = 503, description = "Text generation inference is down", body = ErrorResponse,
+example = json ! ({"error": "unhealthy", "error_type": "healthcheck"})),
+)
+)]
+#[instrument(skip(health))]
 /// Health check method
-#[instrument(skip(infer))]
-async fn health(infer: Extension<Infer>) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
-    // TODO: while this is the best health check we can do, it is a bit on the heavy side and might
-    // be a bit too slow for a health check.
-    // What we should do instead is check if the gRPC channels are still healthy.
-
-    // Send a small inference request
-    infer
-        .generate(GenerateRequest {
-            inputs: "liveness".to_string(),
-            parameters: GenerateParameters {
-                best_of: None,
-                temperature: None,
-                repetition_penalty: None,
-                top_k: None,
-                top_p: None,
-                typical_p: None,
-                do_sample: false,
-                max_new_tokens: 1,
-                return_full_text: None,
-                stop: Vec::new(),
-                truncate: None,
-                watermark: false,
-                details: false,
-                seed: None,
-            },
-        })
-        .await?;
-    Ok(())
+async fn health(mut health: Extension<Health>) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
+    match health.check().await {
+        true => Ok(()),
+        false => Err((
+            StatusCode::SERVICE_UNAVAILABLE,
+            Json(ErrorResponse {
+                error: "unhealthy".to_string(),
+                error_type: "healthcheck".to_string(),
+            }),
+        )),
+    }
 }
 
 /// Generate tokens
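With this handler, /health no longer funnels a GenerateRequest through the regular queue; it delegates to Health::check and answers 200 on success or 503 with the documented error body. A hypothetical external probe, where the base URL and port are assumptions for illustration and not defined in this PR:

// Hypothetical external probe of the router's /health route.
// Assumes the router listens on http://localhost:3000; adjust to your deployment.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let response = reqwest::get("http://localhost:3000/health").await?;
    match response.status().as_u16() {
        200 => println!("router is healthy"),
        503 => println!("unhealthy: {}", response.text().await?),
        other => println!("unexpected status: {other}"),
    }
    Ok(())
}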
@@ -555,6 +551,8 @@ pub async fn run(
         max_input_length,
         max_total_tokens,
     );
+    let generation_health = Arc::new(AtomicBool::new(false));
+    let health_ext = Health::new(client.clone(), generation_health.clone());
     let infer = Infer::new(
         client,
         validation,
@@ -563,6 +561,7 @@ pub async fn run(
         max_waiting_tokens,
         max_concurrent_requests,
         shard_info.requires_padding,
+        generation_health,
     );
 
     // Duration buckets
@@ -657,6 +656,7 @@ pub async fn run(
         // Prometheus metrics route
         .route("/metrics", get(metrics))
         .layer(Extension(info))
+        .layer(Extension(health_ext))
         .layer(Extension(compat_return_full_text))
         .layer(Extension(infer))
         .layer(Extension(prom_handle))
@@ -741,4 +741,3 @@ impl From<InferError> for Event {
             .unwrap()
     }
 }
-
@@ -393,12 +393,22 @@ mod tests{
         let max_input_length = 4;
         let max_total_tokens = 5;
         let workers = 1;
-        let validation = Validation::new(workers, tokenizer, max_best_of, max_stop_sequence, max_input_length, max_total_tokens);
+        let validation = Validation::new(
+            workers,
+            tokenizer,
+            max_best_of,
+            max_stop_sequence,
+            max_input_length,
+            max_total_tokens,
+        );
 
         let max_new_tokens = 10;
-        match validation.validate_input("Hello".to_string(), None, max_new_tokens).await{
+        match validation
+            .validate_input("Hello".to_string(), None, max_new_tokens)
+            .await
+        {
             Err(ValidationError::MaxNewTokens(1, 10)) => (),
-            _ => panic!("Unexpected not max new tokens")
+            _ => panic!("Unexpected not max new tokens"),
         }
     }
@@ -410,12 +420,22 @@ mod tests{
         let max_input_length = 4;
         let max_total_tokens = 5;
         let workers = 1;
-        let validation = Validation::new(workers, tokenizer, max_best_of, max_stop_sequence, max_input_length, max_total_tokens);
+        let validation = Validation::new(
+            workers,
+            tokenizer,
+            max_best_of,
+            max_stop_sequence,
+            max_input_length,
+            max_total_tokens,
+        );
 
         let max_new_tokens = 10;
-        match validation.validate_input("Hello".to_string(), None, max_new_tokens).await{
+        match validation
+            .validate_input("Hello".to_string(), None, max_new_tokens)
+            .await
+        {
             Err(ValidationError::MaxTotalTokens(5, 1, 10)) => (),
-            _ => panic!("Unexpected not max new tokens")
+            _ => panic!("Unexpected not max new tokens"),
         }
     }
@@ -427,19 +447,28 @@ mod tests{
         let max_input_length = 4;
         let max_total_tokens = 5;
         let workers = 1;
-        let validation = Validation::new(workers, tokenizer, max_best_of, max_stop_sequence, max_input_length, max_total_tokens);
-        match validation.validate(GenerateRequest{
+        let validation = Validation::new(
+            workers,
+            tokenizer,
+            max_best_of,
+            max_stop_sequence,
+            max_input_length,
+            max_total_tokens,
+        );
+        match validation
+            .validate(GenerateRequest {
             inputs: "Hello".to_string(),
             parameters: GenerateParameters {
                 best_of: Some(2),
                 do_sample: false,
                 ..default_parameters()
-            }
-        }).await{
+            },
+        })
+        .await
+        {
             Err(ValidationError::BestOfSampling) => (),
-            _ => panic!("Unexpected not best of sampling")
+            _ => panic!("Unexpected not best of sampling"),
         }
 
     }
 
     #[tokio::test]
@@ -450,41 +479,55 @@ mod tests{
         let max_input_length = 4;
         let max_total_tokens = 5;
         let workers = 1;
-        let validation = Validation::new(workers, tokenizer, max_best_of, max_stop_sequence, max_input_length, max_total_tokens);
-        match validation.validate(GenerateRequest{
+        let validation = Validation::new(
+            workers,
+            tokenizer,
+            max_best_of,
+            max_stop_sequence,
+            max_input_length,
+            max_total_tokens,
+        );
+        match validation
+            .validate(GenerateRequest {
             inputs: "Hello".to_string(),
             parameters: GenerateParameters {
                 top_p: Some(1.0),
                 ..default_parameters()
-            }
-        }).await{
+            },
+        })
+        .await
+        {
             Err(ValidationError::TopP) => (),
-            _ => panic!("Unexpected top_p")
+            _ => panic!("Unexpected top_p"),
         }
 
-        match validation.validate(GenerateRequest{
+        match validation
+            .validate(GenerateRequest {
             inputs: "Hello".to_string(),
             parameters: GenerateParameters {
                 top_p: Some(0.99),
                 max_new_tokens: 1,
                 ..default_parameters()
-            }
-        }).await{
+            },
+        })
+        .await
+        {
             Ok(_) => (),
-            _ => panic!("Unexpected top_p error")
+            _ => panic!("Unexpected top_p error"),
         }
 
-        let valid_request = validation.validate(GenerateRequest{
+        let valid_request = validation
+            .validate(GenerateRequest {
             inputs: "Hello".to_string(),
             parameters: GenerateParameters {
                 top_p: None,
                 max_new_tokens: 1,
                 ..default_parameters()
-            }
-        }).await.unwrap();
+            },
+        })
+        .await
+        .unwrap();
         // top_p == 1.0 is invalid for users to ask for but it's the default resolved value.
         assert_eq!(valid_request.parameters.top_p, 1.0);
 
     }
 }
@@ -29,6 +29,11 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
     async def Info(self, request, context):
         return self.model.info
 
+    async def Health(self, request, context):
+        if self.model.device.type == "cuda":
+            torch.zeros((2, 2)).cuda()
+        return generate_pb2.HealthResponse()
+
     async def ServiceDiscovery(self, request, context):
         return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)