feat(router): add device and dtype info (#215)

parent ac8c0f6fe4
commit 343437c7b5
@@ -3,6 +3,8 @@ syntax = "proto3";
 package generate.v1;
 
 service TextGenerationService {
+    /// Model Info
+    rpc Info (InfoRequest) returns (InfoResponse) {}
     /// Service discovery
     rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}
     /// Empties batch cache
@@ -13,6 +15,15 @@ service TextGenerationService {
     rpc Decode (DecodeRequest) returns (DecodeResponse);
 }
 
+/// Empty request
+message InfoRequest {}
+
+message InfoResponse {
+    bool requires_padding = 1;
+    string dtype = 2;
+    string device_type = 3;
+}
+
 /// Empty request
 message ServiceDiscoveryRequest {}
 
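The new Info RPC lets the router ask a shard for its runtime characteristics. As a rough sketch (not part of this commit), the generated Python bindings used elsewhere in this diff can exercise the new messages directly; the field values here are illustrative only:

# Sketch: build the new messages via the generated generate_pb2 module.
from text_generation_server.pb.generate_pb2 import InfoRequest, InfoResponse

request = InfoRequest()  # the Info RPC takes an empty request
response = InfoResponse(
    requires_padding=True,   # illustrative value
    dtype="torch.float16",   # illustrative value
    device_type="cuda",      # illustrative value
)
print(response.requires_padding, response.dtype, response.device_type)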
@@ -54,6 +54,14 @@ impl Client {
         Ok(urls)
     }
 
+    /// Get model info
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<InfoResponse> {
+        let request = tonic::Request::new(InfoRequest {}).inject_context();
+        let response = self.stub.info(request).await?.into_inner();
+        Ok(response)
+    }
+
     /// Clear the past generations cache
     #[instrument(skip(self))]
     pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
@@ -6,6 +6,7 @@ mod pb;
 mod sharded_client;
 
 pub use client::Client;
+pub use pb::generate::v1::InfoResponse as ShardInfo;
 pub use pb::generate::v1::{
     Batch, FinishReason, GeneratedText, Generation, NextTokenChooserParameters, PrefillTokens,
     Request, StoppingCriteriaParameters,

@@ -1,6 +1,6 @@
 /// Multi shard Client
 use crate::Result;
-use crate::{Batch, Client, Generation};
+use crate::{Batch, Client, Generation, ShardInfo};
 use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;
@@ -37,6 +37,17 @@ impl ShardedClient {
         Self::from_master_client(master_client).await
     }
 
+    /// Get the model info
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<ShardInfo> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.info())
+            .collect();
+        join_all(futures).await.pop().unwrap()
+    }
+
     /// Clear the past generations cache
     #[instrument(skip(self))]
     pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
@@ -12,7 +12,7 @@ use validation::Validation;
 
 /// Hub type
 #[derive(Clone, Debug, Deserialize)]
-pub struct ModelInfo {
+pub struct HubModelInfo {
     #[serde(rename(deserialize = "id"))]
     pub model_id: String,
     pub sha: Option<String>,
@@ -25,6 +25,10 @@ pub struct Info {
     pub model_id: String,
     #[schema(nullable = true, example = "e985a63cdc139290c5f700ff1929f0b5942cced2")]
     pub model_sha: Option<String>,
+    #[schema(example = "torch.float16")]
+    pub model_dtype: String,
+    #[schema(example = "cuda")]
+    pub model_device_type: String,
     #[schema(nullable = true, example = "text-generation")]
     pub model_pipeline_tag: Option<String>,
     #[schema(example = "0.5.0")]
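With these fields in place, the router's model-info response carries the shard's dtype and device type alongside the Hub metadata. A hypothetical client-side check, assuming the handler shown later in this diff is mounted at GET /info on a router listening at 127.0.0.1:3000 and using the third-party requests package (route path and port are assumptions, not shown in this diff):

import requests  # assumed third-party dependency

# Hypothetical call against a locally running router.
info = requests.get("http://127.0.0.1:3000/info").json()
print(info["model_id"], info["model_dtype"], info["model_device_type"])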
@@ -10,7 +10,7 @@ use opentelemetry_otlp::WithExportConfig;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr};
 use std::path::Path;
 use text_generation_client::ShardedClient;
-use text_generation_router::{server, ModelInfo};
+use text_generation_router::{server, HubModelInfo};
 use tokenizers::{FromPretrainedParameters, Tokenizer};
 use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;

@@ -128,7 +128,7 @@ fn main() -> Result<(), std::io::Error> {
 
     // Get Model info
     let model_info = match local_model {
-        true => ModelInfo {
+        true => HubModelInfo {
             model_id: tokenizer_name.clone(),
             sha: None,
             pipeline_tag: None,

@@ -154,6 +154,11 @@ fn main() -> Result<(), std::io::Error> {
             .clear_cache(None)
             .await
             .expect("Unable to clear cache");
+        // Get info from the shard
+        let shard_info = sharded_client
+            .info()
+            .await
+            .expect("Unable to get shard info");
         tracing::info!("Connected");
 
         // Binds on localhost

@@ -162,6 +167,7 @@ fn main() -> Result<(), std::io::Error> {
         // Run server
         server::run(
             model_info,
+            shard_info,
             compat_return_full_text,
             max_concurrent_requests,
             max_best_of,

@@ -237,7 +243,7 @@ fn init_logging(otlp_endpoint: Option<String>, json_output: bool) {
 }
 
 /// get model info from the Huggingface Hub
-pub async fn get_model_info(model_id: &str, revision: &str, token: Option<String>) -> ModelInfo {
+pub async fn get_model_info(model_id: &str, revision: &str, token: Option<String>) -> HubModelInfo {
     let client = reqwest::Client::new();
     let mut builder = client.get(format!(
         "https://huggingface.co/api/models/{model_id}/revision/{revision}"
@@ -3,7 +3,7 @@ use crate::infer::{InferError, InferResponse, InferStreamResponse};
 use crate::validation::ValidationError;
 use crate::{
     BestOfSequence, CompatGenerateRequest, Details, ErrorResponse, FinishReason,
-    GenerateParameters, GenerateRequest, GenerateResponse, Infer, Info, ModelInfo, PrefillToken,
+    GenerateParameters, GenerateRequest, GenerateResponse, HubModelInfo, Infer, Info, PrefillToken,
     StreamDetails, StreamResponse, Token, Validation,
 };
 use axum::extract::Extension;

@@ -18,7 +18,7 @@ use futures::Stream;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
 use std::convert::Infallible;
 use std::net::SocketAddr;
-use text_generation_client::ShardedClient;
+use text_generation_client::{ShardInfo, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::signal;
 use tokio::time::Instant;
@@ -78,13 +78,19 @@ async fn compat_generate(
 responses((status = 200, description = "Served model info", body = Info))
 )]
 #[instrument]
-async fn get_model_info(model_info: Extension<ModelInfo>) -> Json<Info> {
+async fn get_model_info(
+    model_info: Extension<HubModelInfo>,
+    shard_info: Extension<ShardInfo>,
+) -> Json<Info> {
     let model_info = model_info.0;
+    let shard_info = shard_info.0;
     let info = Info {
         version: env!("CARGO_PKG_VERSION"),
         sha: option_env!("VERGEN_GIT_SHA"),
         model_id: model_info.model_id,
         model_sha: model_info.sha,
+        model_dtype: shard_info.dtype,
+        model_device_type: shard_info.device_type,
         model_pipeline_tag: model_info.pipeline_tag,
     };
     Json(info)
@@ -497,7 +503,8 @@ async fn metrics(prom_handle: Extension<PrometheusHandle>) -> String {
 /// Serving method
 #[allow(clippy::too_many_arguments)]
 pub async fn run(
-    model_info: ModelInfo,
+    model_info: HubModelInfo,
+    shard_info: ShardInfo,
     compat_return_full_text: bool,
     max_concurrent_requests: usize,
     max_best_of: usize,

@@ -641,6 +648,7 @@ pub async fn run(
         // Prometheus metrics route
         .route("/metrics", get(metrics))
         .layer(Extension(model_info))
+        .layer(Extension(shard_info))
         .layer(Extension(compat_return_full_text))
         .layer(Extension(infer))
         .layer(Extension(prom_handle))
@@ -100,7 +100,11 @@ class BLOOMSharded(BLOOM):
         self.model = model.eval()
         torch.distributed.barrier(group=self.process_group)
         super(CausalLM, self).__init__(
-            tokenizer=tokenizer, device=device, decode_buffer=1
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            decode_buffer=1,
         )
 
     @staticmethod

@@ -400,7 +400,11 @@ class CausalLM(Model):
         )
 
         super(CausalLM, self).__init__(
-            tokenizer=tokenizer, device=device, decode_buffer=decode_buffer
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            decode_buffer=decode_buffer,
         )
 
     @property
@@ -343,7 +343,11 @@ class FlashCausalLM(Model):
         )
 
         super(FlashCausalLM, self).__init__(
-            tokenizer=tokenizer, device=device, decode_buffer=decode_buffer
+            tokenizer=tokenizer,
+            requires_padding=False,
+            dtype=dtype,
+            device=device,
+            decode_buffer=decode_buffer,
         )
 
     @property
@@ -63,6 +63,8 @@ class FlashLlama(FlashCausalLM):
 
         super(FlashCausalLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=False,
+            dtype=dtype,
             device=device,
         )
 

@@ -184,6 +186,8 @@ class FlashLlamaSharded(FlashLlama):
         torch.distributed.barrier(group=self.process_group)
         super(FlashCausalLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=False,
+            dtype=dtype,
             device=device,
         )
 
@@ -70,6 +70,8 @@ class FlashNeoXSharded(FlashNeoX):
         torch.distributed.barrier(group=self.process_group)
         super(FlashCausalLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=False,
+            dtype=dtype,
             device=device,
         )
 
@@ -65,7 +65,11 @@ class FlashSantacoder(FlashCausalLM):
         self.model = model.eval().to(device)
 
         super(FlashCausalLM, self).__init__(
-            tokenizer=tokenizer, device=device, decode_buffer=1
+            tokenizer=tokenizer,
+            requires_padding=False,
+            dtype=dtype,
+            device=device,
+            decode_buffer=1,
         )
 
     @staticmethod

@@ -206,6 +210,8 @@ class FlashSantacoderSharded(FlashSantacoder):
         torch.distributed.barrier(group=self.process_group)
         super(FlashCausalLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=False,
+            dtype=dtype,
             device=device,
         )
 
@@ -228,6 +228,8 @@ class GalacticaSharded(Galactica):
         torch.distributed.barrier(group=self.process_group)
         super(CausalLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
             device=device,
         )
 

@@ -72,6 +72,8 @@ class GPTNeoxSharded(CausalLM):
         torch.distributed.barrier(group=self.process_group)
         super(CausalLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
             device=device,
         )
 
@@ -5,6 +5,7 @@ from typing import List, Tuple, Optional, TypeVar, Type
 from transformers import PreTrainedTokenizerBase
 
 from text_generation_server.models.types import Batch, GeneratedText
+from text_generation_server.pb.generate_pb2 import InfoResponse
 
 B = TypeVar("B", bound=Batch)
 

@@ -13,6 +14,8 @@ class Model(ABC):
     def __init__(
         self,
         tokenizer: PreTrainedTokenizerBase,
+        requires_padding: bool,
+        dtype: torch.dtype,
         device: torch.device,
         decode_buffer: int = 3,
     ):
@@ -21,9 +24,19 @@ class Model(ABC):
 
         self.tokenizer = tokenizer
         self.all_special_ids = set(tokenizer.all_special_ids)
+        self.requires_padding = requires_padding
+        self.dtype = dtype
         self.device = device
         self.decode_buffer = decode_buffer
 
+    @property
+    def info(self) -> InfoResponse:
+        return InfoResponse(
+            requires_padding=self.requires_padding,
+            dtype=str(self.dtype),
+            device_type=self.device.type,
+        )
+
     @property
     @abstractmethod
     def batch_type(self) -> Type[B]:
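The base Model class now records requires_padding, dtype, and device, and its new info property serializes them into the InfoResponse message. The string forms it produces line up with the schema examples added to the router's Info struct above; a quick illustrative check (not part of the commit):

import torch

# str() of a torch dtype and .type of a torch device yield exactly the strings
# that InfoResponse carries ("torch.float16", "cuda"/"cpu").
print(str(torch.float16))            # -> "torch.float16"
print(torch.device("cuda:0").type)   # -> "cuda"
print(torch.device("cpu").type)      # -> "cpu"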
@@ -88,6 +88,8 @@ class OPTSharded(OPT):
         torch.distributed.barrier(group=self.process_group)
         super(CausalLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
             device=device,
         )
 

@@ -54,7 +54,11 @@ class SantaCoder(CausalLM):
         )
 
         super(CausalLM, self).__init__(
-            tokenizer=tokenizer, device=device, decode_buffer=1
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            decode_buffer=1,
         )
 
     def decode(self, generated_ids: List[int]) -> str:
@@ -460,7 +460,11 @@ class Seq2SeqLM(Model):
         tokenizer.bos_token_id = self.model.config.decoder_start_token_id
 
         super(Seq2SeqLM, self).__init__(
-            tokenizer=tokenizer, device=device, decode_buffer=decode_buffer
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            decode_buffer=decode_buffer,
         )
 
     @property

@@ -72,6 +72,8 @@ class T5Sharded(Seq2SeqLM):
         torch.distributed.barrier(group=self.process_group)
         super(Seq2SeqLM, self).__init__(
             tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
             device=device,
         )
 
@@ -26,6 +26,9 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
         # Force inference mode for the lifetime of TextGenerationService
         self._inference_mode_raii_guard = torch._C._InferenceMode(True)
 
+    async def Info(self, request, context):
+        return self.model.info
+
     async def ServiceDiscovery(self, request, context):
         return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls)
 
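For manual testing, the new Info handler can be exercised with a plain grpcio client against a running shard. Everything below is a sketch: the unix-socket address is an assumption (shards listen on a socket chosen by the launcher), and the stub name follows standard grpcio code generation for the TextGenerationService defined in the proto above.

import grpc

from text_generation_server.pb import generate_pb2, generate_pb2_grpc

# Assumed socket path; substitute the uds path your shard actually listens on.
channel = grpc.insecure_channel("unix:///tmp/text-generation-server-0")
stub = generate_pb2_grpc.TextGenerationServiceStub(channel)

info = stub.Info(generate_pb2.InfoRequest())
print(info.requires_padding, info.dtype, info.device_type)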