diff --git a/.gitignore b/.gitignore index 4270a1ae..9434d75c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ router/tokenizer.json backends/v2/src/client/pb backends/v3/src/client/pb +backends/client/src/v2/pb +backends/client/src/v3/pb # ROCm auto-generated files *.hip diff --git a/backends/client/src/v2/pb/generate.v2.rs b/backends/client/src/v2/pb/generate.v2.rs deleted file mode 100644 index 3af5670b..00000000 --- a/backends/client/src/v2/pb/generate.v2.rs +++ /dev/null @@ -1,613 +0,0 @@ -// This file is @generated by prost-build. -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct HealthRequest {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct HealthResponse {} -/// / Empty request -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct InfoRequest {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct InfoResponse { - #[prost(bool, tag = "1")] - pub requires_padding: bool, - #[prost(string, tag = "2")] - pub dtype: ::prost::alloc::string::String, - #[prost(string, tag = "3")] - pub device_type: ::prost::alloc::string::String, - #[prost(uint32, optional, tag = "4")] - pub window_size: ::core::option::Option, - #[prost(uint32, tag = "5")] - pub speculate: u32, -} -/// / Empty request -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ServiceDiscoveryRequest {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ServiceDiscoveryResponse { - /// / Other shards urls - #[prost(string, repeated, tag = "1")] - pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ClearCacheRequest { - /// / Optional batch id - #[prost(uint64, optional, tag = "1")] - pub id: ::core::option::Option, -} -/// / Empty response -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ClearCacheResponse {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct NextTokenChooserParameters { - /// / exponential scaling output probability distribution - #[prost(float, tag = "1")] - pub temperature: f32, - /// / restricting to the k highest probability elements - #[prost(uint32, tag = "2")] - pub top_k: u32, - /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off - #[prost(float, tag = "3")] - pub top_p: f32, - /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off - #[prost(float, tag = "4")] - pub typical_p: f32, - /// / apply sampling on the logits - #[prost(bool, tag = "5")] - pub do_sample: bool, - /// / random seed for sampling - #[prost(uint64, tag = "6")] - pub seed: u64, - /// / repetition penalty - #[prost(float, tag = "7")] - pub repetition_penalty: f32, - /// / frequency penalty - #[prost(float, tag = "9")] - pub frequency_penalty: f32, - /// / token watermarking using "A Watermark for Large Language Models" - #[prost(bool, tag = "8")] - pub watermark: bool, - /// / grammar (applied if not empty) - #[prost(string, tag = "10")] - pub grammar: ::prost::alloc::string::String, - /// / grammar type - #[prost(enumeration = "GrammarType", tag = "11")] - pub grammar_type: i32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct StoppingCriteriaParameters { - /// / Maximum number of generated tokens - #[prost(uint32, tag = "1")] - pub max_new_tokens: u32, - /// / Optional stopping sequences - #[prost(string, repeated, tag = "2")] - pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - /// / Ignore end of sequence token - /// / used for benchmarking - #[prost(bool, tag = "3")] - pub ignore_eos_token: bool, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Request { - /// / Request ID - #[prost(uint64, tag = "1")] - pub id: u64, - /// / The generation context - #[prost(string, tag = "2")] - pub inputs: ::prost::alloc::string::String, - /// / Context truncation - #[prost(uint32, tag = "3")] - pub truncate: u32, - /// / Next Token Chooser Parameters - #[prost(message, optional, tag = "4")] - pub parameters: ::core::option::Option, - /// / Stopping Criteria Parameters - #[prost(message, optional, tag = "5")] - pub stopping_parameters: ::core::option::Option, - /// / Return prefill logprobs - #[prost(bool, tag = "6")] - pub prefill_logprobs: bool, - /// / Return most likely n tokens - #[prost(uint32, tag = "7")] - pub top_n_tokens: u32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Batch { - /// / Batch ID - #[prost(uint64, tag = "1")] - pub id: u64, - /// / Individual requests - #[prost(message, repeated, tag = "2")] - pub requests: ::prost::alloc::vec::Vec, - /// / Batch size (==len(requests)) - #[prost(uint32, tag = "3")] - pub size: u32, - /// / Maximum number of tokens this batch will grow to - #[prost(uint32, tag = "4")] - pub max_tokens: u32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct CachedBatch { - /// / Batch ID - #[prost(uint64, tag = "1")] - pub id: u64, - /// / Individual requests ids - #[prost(uint64, repeated, tag = "2")] - pub request_ids: ::prost::alloc::vec::Vec, - /// / Batch size (==len(requests)) - #[prost(uint32, tag = "3")] - pub size: u32, - /// / Maximum number of tokens this batch will grow to - #[prost(uint32, tag = "4")] - pub max_tokens: u32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct GeneratedText { - /// / Output - #[prost(string, tag = "1")] - pub text: ::prost::alloc::string::String, - /// / Number of generated tokens - #[prost(uint32, tag = "2")] - pub generated_tokens: u32, - /// / Finish reason - #[prost(enumeration = "FinishReason", tag = "3")] - pub finish_reason: i32, - /// / Seed - #[prost(uint64, optional, tag = "4")] - pub seed: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Tokens { - /// / Token IDs - #[prost(uint32, repeated, tag = "1")] - pub ids: ::prost::alloc::vec::Vec, - /// / Logprobs - #[prost(float, repeated, tag = "2")] - pub logprobs: ::prost::alloc::vec::Vec, - /// / tokens - #[prost(string, repeated, tag = "3")] - pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - /// / special - #[prost(bool, repeated, tag = "4")] - pub is_special: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Generation { - /// / Request ID - #[prost(uint64, tag = "1")] - pub request_id: u64, - /// / Prefill tokens (optional) - #[prost(message, optional, tag = "2")] - pub prefill_tokens: ::core::option::Option, - #[prost(message, optional, tag = "3")] - pub tokens: ::core::option::Option, - /// / Complete generated text - #[prost(message, optional, tag = "4")] - pub generated_text: ::core::option::Option, - /// / Top tokens - #[prost(message, repeated, tag = "5")] - pub top_tokens: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FilterBatchRequest { - /// / Batch ID - #[prost(uint64, tag = "1")] - pub batch_id: u64, - /// / Requests to keep - #[prost(uint64, repeated, tag = "2")] - pub request_ids: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FilterBatchResponse { - /// / Filtered Batch (cached) - #[prost(message, optional, tag = "1")] - pub batch: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct PrefillRequest { - /// / Batch - #[prost(message, optional, tag = "1")] - pub batch: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct PrefillResponse { - /// / Generation - #[prost(message, repeated, tag = "1")] - pub generations: ::prost::alloc::vec::Vec, - /// / Next batch (cached) - #[prost(message, optional, tag = "2")] - pub batch: ::core::option::Option, - /// / Forward elapsed time in nanoseconds - #[prost(uint64, tag = "3")] - pub forward_ns: u64, - /// / Decode elapsed time in nanoseconds - #[prost(uint64, tag = "4")] - pub decode_ns: u64, - /// / Total elapsed time in nanoseconds - #[prost(uint64, tag = "5")] - pub total_ns: u64, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct DecodeRequest { - /// / Cached batches - #[prost(message, repeated, tag = "1")] - pub batches: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct DecodeResponse { - /// / Decodes - #[prost(message, repeated, tag = "1")] - pub generations: ::prost::alloc::vec::Vec, - /// / Next batch (cached) - #[prost(message, optional, tag = "2")] - pub batch: ::core::option::Option, - /// / Forward elapsed time in nanoseconds - #[prost(uint64, tag = "3")] - pub forward_ns: u64, - /// / Decode elapsed time in nanoseconds - #[prost(uint64, tag = "4")] - pub decode_ns: u64, - /// / Total elapsed time in nanoseconds - #[prost(uint64, tag = "5")] - pub total_ns: u64, - /// / Concatenate elapsed time in nanoseconds - #[prost(uint64, optional, tag = "6")] - pub concat_ns: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct WarmupRequest { - /// / Batch to warmup on - #[prost(message, optional, tag = "1")] - pub batch: ::core::option::Option, - #[prost(uint32, tag = "2")] - pub max_input_length: u32, - #[prost(uint32, tag = "3")] - pub max_prefill_tokens: u32, - #[prost(uint32, tag = "4")] - pub max_total_tokens: u32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct WarmupResponse { - /// / Maximum number of tokens supported by the model - #[prost(uint32, optional, tag = "1")] - pub max_supported_total_tokens: ::core::option::Option, -} -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] -#[repr(i32)] -pub enum GrammarType { - None = 0, - Json = 1, - Regex = 2, -} -impl GrammarType { - /// String value of the enum field names used in the ProtoBuf definition. - /// - /// The values are not transformed in any way and thus are considered stable - /// (if the ProtoBuf definition does not change) and safe for programmatic use. - pub fn as_str_name(&self) -> &'static str { - match self { - GrammarType::None => "GRAMMAR_TYPE_NONE", - GrammarType::Json => "GRAMMAR_TYPE_JSON", - GrammarType::Regex => "GRAMMAR_TYPE_REGEX", - } - } - /// Creates an enum from field names used in the ProtoBuf definition. - pub fn from_str_name(value: &str) -> ::core::option::Option { - match value { - "GRAMMAR_TYPE_NONE" => Some(Self::None), - "GRAMMAR_TYPE_JSON" => Some(Self::Json), - "GRAMMAR_TYPE_REGEX" => Some(Self::Regex), - _ => None, - } - } -} -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] -#[repr(i32)] -pub enum FinishReason { - Length = 0, - EosToken = 1, - StopSequence = 2, -} -impl FinishReason { - /// String value of the enum field names used in the ProtoBuf definition. - /// - /// The values are not transformed in any way and thus are considered stable - /// (if the ProtoBuf definition does not change) and safe for programmatic use. - pub fn as_str_name(&self) -> &'static str { - match self { - FinishReason::Length => "FINISH_REASON_LENGTH", - FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN", - FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE", - } - } - /// Creates an enum from field names used in the ProtoBuf definition. - pub fn from_str_name(value: &str) -> ::core::option::Option { - match value { - "FINISH_REASON_LENGTH" => Some(Self::Length), - "FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken), - "FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence), - _ => None, - } - } -} -/// Generated client implementations. -pub mod text_generation_service_client { - #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::http::Uri; - use tonic::codegen::*; - #[derive(Debug, Clone)] - pub struct TextGenerationServiceClient { - inner: tonic::client::Grpc, - } - impl TextGenerationServiceClient { - /// Attempt to create a new client by connecting to a given endpoint. - pub async fn connect(dst: D) -> Result - where - D: TryInto, - D::Error: Into, - { - let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; - Ok(Self::new(conn)) - } - } - impl TextGenerationServiceClient - where - T: tonic::client::GrpcService, - T::Error: Into, - T::ResponseBody: Body + Send + 'static, - ::Error: Into + Send, - { - pub fn new(inner: T) -> Self { - let inner = tonic::client::Grpc::new(inner); - Self { inner } - } - pub fn with_origin(inner: T, origin: Uri) -> Self { - let inner = tonic::client::Grpc::with_origin(inner, origin); - Self { inner } - } - pub fn with_interceptor( - inner: T, - interceptor: F, - ) -> TextGenerationServiceClient> - where - F: tonic::service::Interceptor, - T::ResponseBody: Default, - T: tonic::codegen::Service< - http::Request, - Response = http::Response< - >::ResponseBody, - >, - >, - >>::Error: - Into + Send + Sync, - { - TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor)) - } - /// Compress requests with the given encoding. - /// - /// This requires the server to support it otherwise it might respond with an - /// error. - #[must_use] - pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.inner = self.inner.send_compressed(encoding); - self - } - /// Enable decompressing responses. - #[must_use] - pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.inner = self.inner.accept_compressed(encoding); - self - } - /// Limits the maximum size of a decoded message. - /// - /// Default: `4MB` - #[must_use] - pub fn max_decoding_message_size(mut self, limit: usize) -> Self { - self.inner = self.inner.max_decoding_message_size(limit); - self - } - /// Limits the maximum size of an encoded message. - /// - /// Default: `usize::MAX` - #[must_use] - pub fn max_encoding_message_size(mut self, limit: usize) -> Self { - self.inner = self.inner.max_encoding_message_size(limit); - self - } - /// / Model Info - pub async fn info( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Info"); - let mut req = request.into_request(); - req.extensions_mut() - .insert(GrpcMethod::new("generate.v2.TextGenerationService", "Info")); - self.inner.unary(req, path, codec).await - } - /// / Service discovery - pub async fn service_discovery( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/generate.v2.TextGenerationService/ServiceDiscovery", - ); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v2.TextGenerationService", - "ServiceDiscovery", - )); - self.inner.unary(req, path, codec).await - } - /// / Empties batch cache - pub async fn clear_cache( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/generate.v2.TextGenerationService/ClearCache", - ); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v2.TextGenerationService", - "ClearCache", - )); - self.inner.unary(req, path, codec).await - } - /// / Remove requests from a cached batch - pub async fn filter_batch( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/generate.v2.TextGenerationService/FilterBatch", - ); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v2.TextGenerationService", - "FilterBatch", - )); - self.inner.unary(req, path, codec).await - } - /// / Warmup the model and compute max cache size - pub async fn warmup( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Warmup"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v2.TextGenerationService", - "Warmup", - )); - self.inner.unary(req, path, codec).await - } - /// / Prefill batch and decode first token - pub async fn prefill( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Prefill"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v2.TextGenerationService", - "Prefill", - )); - self.inner.unary(req, path, codec).await - } - /// / Decode token for a list of prefilled batches - pub async fn decode( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Decode"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v2.TextGenerationService", - "Decode", - )); - self.inner.unary(req, path, codec).await - } - /// / Health check - pub async fn health( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v2.TextGenerationService/Health"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v2.TextGenerationService", - "Health", - )); - self.inner.unary(req, path, codec).await - } - } -} diff --git a/backends/client/src/v2/pb/mod.rs b/backends/client/src/v2/pb/mod.rs deleted file mode 100644 index 095ead1f..00000000 --- a/backends/client/src/v2/pb/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -// This file is @generated by prost-build. -pub mod generate { - pub mod v2 { - include!("generate.v2.rs"); - } -} diff --git a/backends/client/src/v3/pb/generate.v3.rs b/backends/client/src/v3/pb/generate.v3.rs deleted file mode 100644 index 95423e30..00000000 --- a/backends/client/src/v3/pb/generate.v3.rs +++ /dev/null @@ -1,699 +0,0 @@ -// This file is @generated by prost-build. -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct HealthRequest {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct HealthResponse {} -/// / Empty request -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct InfoRequest {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct InfoResponse { - #[prost(bool, tag = "1")] - pub requires_padding: bool, - #[prost(string, tag = "2")] - pub dtype: ::prost::alloc::string::String, - #[prost(string, tag = "3")] - pub device_type: ::prost::alloc::string::String, - #[prost(uint32, optional, tag = "4")] - pub window_size: ::core::option::Option, - #[prost(uint32, tag = "5")] - pub speculate: u32, - #[prost(bool, tag = "6")] - pub support_chunking: bool, - #[prost(bool, tag = "7")] - pub use_prefix_caching: bool, - #[prost(string, tag = "8")] - pub attention_impl: ::prost::alloc::string::String, - #[prost(uint32, tag = "9")] - pub block_size: u32, -} -/// / Empty request -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ServiceDiscoveryRequest {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ServiceDiscoveryResponse { - /// / Other shards urls - #[prost(string, repeated, tag = "1")] - pub urls: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ClearCacheRequest { - /// / Optional batch id - #[prost(uint64, optional, tag = "1")] - pub id: ::core::option::Option, -} -/// / Empty response -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct ClearCacheResponse {} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Image { - /// / Binary image data. - #[prost(bytes = "vec", tag = "1")] - pub data: ::prost::alloc::vec::Vec, - /// / Image MIME type. - #[prost(string, tag = "2")] - pub mimetype: ::prost::alloc::string::String, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct InputChunk { - #[prost(oneof = "input_chunk::Chunk", tags = "1, 2")] - pub chunk: ::core::option::Option, -} -/// Nested message and enum types in `InputChunk`. -pub mod input_chunk { - #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] - pub enum Chunk { - /// / Plain text data - #[prost(string, tag = "1")] - Text(::prost::alloc::string::String), - /// / Image data - #[prost(message, tag = "2")] - Image(super::Image), - } -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Input { - #[prost(message, repeated, tag = "1")] - pub chunks: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct NextTokenChooserParameters { - /// / exponential scaling output probability distribution - #[prost(float, tag = "1")] - pub temperature: f32, - /// / restricting to the k highest probability elements - #[prost(uint32, tag = "2")] - pub top_k: u32, - /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off - #[prost(float, tag = "3")] - pub top_p: f32, - /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off - #[prost(float, tag = "4")] - pub typical_p: f32, - /// / apply sampling on the logits - #[prost(bool, tag = "5")] - pub do_sample: bool, - /// / random seed for sampling - #[prost(uint64, tag = "6")] - pub seed: u64, - /// / repetition penalty - #[prost(float, tag = "7")] - pub repetition_penalty: f32, - /// / frequency penalty - #[prost(float, tag = "9")] - pub frequency_penalty: f32, - /// / token watermarking using "A Watermark for Large Language Models" - #[prost(bool, tag = "8")] - pub watermark: bool, - /// / grammar (applied if not empty) - #[prost(string, tag = "10")] - pub grammar: ::prost::alloc::string::String, - /// / grammar type - #[prost(enumeration = "GrammarType", tag = "11")] - pub grammar_type: i32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct StoppingCriteriaParameters { - /// / Maximum number of generated tokens - #[prost(uint32, tag = "1")] - pub max_new_tokens: u32, - /// / Optional stopping sequences - #[prost(string, repeated, tag = "2")] - pub stop_sequences: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - /// / Ignore end of sequence token - /// / used for benchmarking - #[prost(bool, tag = "3")] - pub ignore_eos_token: bool, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Request { - /// / Request ID - #[prost(uint64, tag = "1")] - pub id: u64, - /// / The generation context as chunks - #[prost(message, optional, tag = "8")] - pub input_chunks: ::core::option::Option, - /// / The generation context, stringified input_chunks - #[prost(string, tag = "2")] - pub inputs: ::prost::alloc::string::String, - /// / Context truncation - #[prost(uint32, tag = "3")] - pub truncate: u32, - /// / Next Token Chooser Parameters - #[prost(message, optional, tag = "4")] - pub parameters: ::core::option::Option, - /// / Stopping Criteria Parameters - #[prost(message, optional, tag = "5")] - pub stopping_parameters: ::core::option::Option, - /// / Return prefill logprobs - #[prost(bool, tag = "6")] - pub prefill_logprobs: bool, - /// / Return most likely n tokens - #[prost(uint32, tag = "7")] - pub top_n_tokens: u32, - /// / Paged attention blocks - #[prost(uint32, repeated, tag = "9")] - pub blocks: ::prost::alloc::vec::Vec, - /// / Paged attention slots - #[prost(uint32, repeated, tag = "10")] - pub slots: ::prost::alloc::vec::Vec, - /// / LORA adapter index - #[prost(string, optional, tag = "11")] - pub adapter_id: ::core::option::Option<::prost::alloc::string::String>, - /// / Tokens that can be retrieved from the KV cache. - /// / This value is set for the first prefill and never reset - #[prost(uint32, tag = "12")] - pub cache_len: u32, - /// / Context truncation - #[prost(bool, tag = "13")] - pub add_special_tokens: bool, - /// / Chunk of tokens that must be computed for the first prefill - /// / This value is set for the first prefill and never reset - #[prost(uint32, optional, tag = "14")] - pub chunk_len: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Batch { - /// / Batch ID - #[prost(uint64, tag = "1")] - pub id: u64, - /// / Individual requests - #[prost(message, repeated, tag = "2")] - pub requests: ::prost::alloc::vec::Vec, - /// / Batch size (==len(requests)) - #[prost(uint32, tag = "3")] - pub size: u32, - /// / Maximum number of tokens this batch will grow to - #[prost(uint32, tag = "4")] - pub max_tokens: u32, - /// / Maximum number of Paged Attention blocks - #[prost(uint32, tag = "5")] - pub max_blocks: u32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct CachedBatch { - /// / Batch ID - #[prost(uint64, tag = "1")] - pub id: u64, - /// / Individual requests ids - #[prost(uint64, repeated, tag = "2")] - pub request_ids: ::prost::alloc::vec::Vec, - /// / Batch size (==len(requests)) - #[prost(uint32, tag = "3")] - pub size: u32, - /// / Maximum number of tokens this batch will grow to - #[prost(uint32, tag = "4")] - pub max_tokens: u32, - /// / Number of tokens in the next forward - #[prost(uint32, tag = "5")] - pub current_tokens: u32, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct GeneratedText { - /// / Output - #[prost(string, tag = "1")] - pub text: ::prost::alloc::string::String, - /// / Number of generated tokens - #[prost(uint32, tag = "2")] - pub generated_tokens: u32, - /// / Finish reason - #[prost(enumeration = "FinishReason", tag = "3")] - pub finish_reason: i32, - /// / Seed - #[prost(uint64, optional, tag = "4")] - pub seed: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Tokens { - /// / Token IDs - #[prost(uint32, repeated, tag = "1")] - pub ids: ::prost::alloc::vec::Vec, - /// / Logprobs - #[prost(float, repeated, tag = "2")] - pub logprobs: ::prost::alloc::vec::Vec, - /// / tokens - #[prost(string, repeated, tag = "3")] - pub texts: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - /// / special - #[prost(bool, repeated, tag = "4")] - pub is_special: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct Generation { - /// / Request ID - #[prost(uint64, tag = "1")] - pub request_id: u64, - /// / Prefill tokens (optional) - #[prost(message, optional, tag = "2")] - pub prefill_tokens: ::core::option::Option, - #[prost(message, optional, tag = "3")] - pub tokens: ::core::option::Option, - /// / Complete generated text - #[prost(message, optional, tag = "4")] - pub generated_text: ::core::option::Option, - /// / Top tokens - #[prost(message, repeated, tag = "5")] - pub top_tokens: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FilterBatchRequest { - /// / Batch ID - #[prost(uint64, tag = "1")] - pub batch_id: u64, - /// / Requests to keep - #[prost(uint64, repeated, tag = "2")] - pub request_ids: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct FilterBatchResponse { - /// / Filtered Batch (cached) - #[prost(message, optional, tag = "1")] - pub batch: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct PrefillRequest { - /// / Batch - #[prost(message, optional, tag = "1")] - pub batch: ::core::option::Option, - /// / Optional cached batch - #[prost(message, optional, tag = "2")] - pub cached_batch: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct PrefillResponse { - /// / Generation - #[prost(message, repeated, tag = "1")] - pub generations: ::prost::alloc::vec::Vec, - /// / Next batch (cached) - #[prost(message, optional, tag = "2")] - pub batch: ::core::option::Option, - /// / Forward elapsed time in nanoseconds - #[prost(uint64, tag = "3")] - pub forward_ns: u64, - /// / Decode elapsed time in nanoseconds - #[prost(uint64, tag = "4")] - pub decode_ns: u64, - /// / Total elapsed time in nanoseconds - #[prost(uint64, tag = "5")] - pub total_ns: u64, - /// / Concatenate elapsed time in nanoseconds - #[prost(uint64, optional, tag = "6")] - pub concat_ns: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct DecodeRequest { - /// / Cached batches - #[prost(message, repeated, tag = "1")] - pub batches: ::prost::alloc::vec::Vec, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct DecodeResponse { - /// / Decodes - #[prost(message, repeated, tag = "1")] - pub generations: ::prost::alloc::vec::Vec, - /// / Next batch (cached) - #[prost(message, optional, tag = "2")] - pub batch: ::core::option::Option, - /// / Forward elapsed time in nanoseconds - #[prost(uint64, tag = "3")] - pub forward_ns: u64, - /// / Decode elapsed time in nanoseconds - #[prost(uint64, tag = "4")] - pub decode_ns: u64, - /// / Total elapsed time in nanoseconds - #[prost(uint64, tag = "5")] - pub total_ns: u64, - /// / Concatenate elapsed time in nanoseconds - #[prost(uint64, optional, tag = "6")] - pub concat_ns: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct WarmupRequest { - /// / Batch to warmup on - #[prost(message, optional, tag = "1")] - pub batch: ::core::option::Option, - #[prost(uint32, optional, tag = "2")] - pub max_input_tokens: ::core::option::Option, - #[prost(uint32, tag = "3")] - pub max_prefill_tokens: u32, - #[prost(uint32, optional, tag = "4")] - pub max_total_tokens: ::core::option::Option, -} -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct WarmupResponse { - /// / Maximum number of tokens supported by the model - #[prost(uint32, optional, tag = "1")] - pub max_supported_total_tokens: ::core::option::Option, - /// / Maximum input tokens by clients should be equal to request value if it's set - /// / Otherwise warmup automatically allocates a value here - #[prost(uint32, tag = "2")] - pub max_input_tokens: u32, - /// / Maximum total tokens by clients should be equal to request value if it's set - /// / Otherwise warmup automatically allocates a value here - #[prost(uint32, tag = "3")] - pub max_total_tokens: u32, -} -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] -#[repr(i32)] -pub enum GrammarType { - None = 0, - Json = 1, - Regex = 2, -} -impl GrammarType { - /// String value of the enum field names used in the ProtoBuf definition. - /// - /// The values are not transformed in any way and thus are considered stable - /// (if the ProtoBuf definition does not change) and safe for programmatic use. - pub fn as_str_name(&self) -> &'static str { - match self { - GrammarType::None => "GRAMMAR_TYPE_NONE", - GrammarType::Json => "GRAMMAR_TYPE_JSON", - GrammarType::Regex => "GRAMMAR_TYPE_REGEX", - } - } - /// Creates an enum from field names used in the ProtoBuf definition. - pub fn from_str_name(value: &str) -> ::core::option::Option { - match value { - "GRAMMAR_TYPE_NONE" => Some(Self::None), - "GRAMMAR_TYPE_JSON" => Some(Self::Json), - "GRAMMAR_TYPE_REGEX" => Some(Self::Regex), - _ => None, - } - } -} -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] -#[repr(i32)] -pub enum FinishReason { - Length = 0, - EosToken = 1, - StopSequence = 2, -} -impl FinishReason { - /// String value of the enum field names used in the ProtoBuf definition. - /// - /// The values are not transformed in any way and thus are considered stable - /// (if the ProtoBuf definition does not change) and safe for programmatic use. - pub fn as_str_name(&self) -> &'static str { - match self { - FinishReason::Length => "FINISH_REASON_LENGTH", - FinishReason::EosToken => "FINISH_REASON_EOS_TOKEN", - FinishReason::StopSequence => "FINISH_REASON_STOP_SEQUENCE", - } - } - /// Creates an enum from field names used in the ProtoBuf definition. - pub fn from_str_name(value: &str) -> ::core::option::Option { - match value { - "FINISH_REASON_LENGTH" => Some(Self::Length), - "FINISH_REASON_EOS_TOKEN" => Some(Self::EosToken), - "FINISH_REASON_STOP_SEQUENCE" => Some(Self::StopSequence), - _ => None, - } - } -} -/// Generated client implementations. -pub mod text_generation_service_client { - #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::http::Uri; - use tonic::codegen::*; - #[derive(Debug, Clone)] - pub struct TextGenerationServiceClient { - inner: tonic::client::Grpc, - } - impl TextGenerationServiceClient { - /// Attempt to create a new client by connecting to a given endpoint. - pub async fn connect(dst: D) -> Result - where - D: TryInto, - D::Error: Into, - { - let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; - Ok(Self::new(conn)) - } - } - impl TextGenerationServiceClient - where - T: tonic::client::GrpcService, - T::Error: Into, - T::ResponseBody: Body + Send + 'static, - ::Error: Into + Send, - { - pub fn new(inner: T) -> Self { - let inner = tonic::client::Grpc::new(inner); - Self { inner } - } - pub fn with_origin(inner: T, origin: Uri) -> Self { - let inner = tonic::client::Grpc::with_origin(inner, origin); - Self { inner } - } - pub fn with_interceptor( - inner: T, - interceptor: F, - ) -> TextGenerationServiceClient> - where - F: tonic::service::Interceptor, - T::ResponseBody: Default, - T: tonic::codegen::Service< - http::Request, - Response = http::Response< - >::ResponseBody, - >, - >, - >>::Error: - Into + Send + Sync, - { - TextGenerationServiceClient::new(InterceptedService::new(inner, interceptor)) - } - /// Compress requests with the given encoding. - /// - /// This requires the server to support it otherwise it might respond with an - /// error. - #[must_use] - pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.inner = self.inner.send_compressed(encoding); - self - } - /// Enable decompressing responses. - #[must_use] - pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.inner = self.inner.accept_compressed(encoding); - self - } - /// Limits the maximum size of a decoded message. - /// - /// Default: `4MB` - #[must_use] - pub fn max_decoding_message_size(mut self, limit: usize) -> Self { - self.inner = self.inner.max_decoding_message_size(limit); - self - } - /// Limits the maximum size of an encoded message. - /// - /// Default: `usize::MAX` - #[must_use] - pub fn max_encoding_message_size(mut self, limit: usize) -> Self { - self.inner = self.inner.max_encoding_message_size(limit); - self - } - /// / Model Info - pub async fn info( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Info"); - let mut req = request.into_request(); - req.extensions_mut() - .insert(GrpcMethod::new("generate.v3.TextGenerationService", "Info")); - self.inner.unary(req, path, codec).await - } - /// / Service discovery - pub async fn service_discovery( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/generate.v3.TextGenerationService/ServiceDiscovery", - ); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v3.TextGenerationService", - "ServiceDiscovery", - )); - self.inner.unary(req, path, codec).await - } - /// / Empties batch cache - pub async fn clear_cache( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/generate.v3.TextGenerationService/ClearCache", - ); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v3.TextGenerationService", - "ClearCache", - )); - self.inner.unary(req, path, codec).await - } - /// / Remove requests from a cached batch - pub async fn filter_batch( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/generate.v3.TextGenerationService/FilterBatch", - ); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v3.TextGenerationService", - "FilterBatch", - )); - self.inner.unary(req, path, codec).await - } - /// / Warmup the model and compute max cache size - pub async fn warmup( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Warmup"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v3.TextGenerationService", - "Warmup", - )); - self.inner.unary(req, path, codec).await - } - /// / Prefill batch and decode first token - pub async fn prefill( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Prefill"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v3.TextGenerationService", - "Prefill", - )); - self.inner.unary(req, path, codec).await - } - /// / Decode token for a list of prefilled batches - pub async fn decode( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Decode"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v3.TextGenerationService", - "Decode", - )); - self.inner.unary(req, path, codec).await - } - /// / Health check - pub async fn health( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/generate.v3.TextGenerationService/Health"); - let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "generate.v3.TextGenerationService", - "Health", - )); - self.inner.unary(req, path, codec).await - } - } -} diff --git a/backends/client/src/v3/pb/mod.rs b/backends/client/src/v3/pb/mod.rs deleted file mode 100644 index b5397d05..00000000 --- a/backends/client/src/v3/pb/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -// This file is @generated by prost-build. -pub mod generate { - pub mod v3 { - include!("generate.v3.rs"); - } -}