feat: add SchedulerV3 (#1996)

- Refactor code to allow supporting multiple versions of the generate.proto at the same time - Add v3/generate.proto (ISO to generate.proto for now but allow for future changes without impacting v2 backends) - Add Schedule trait to abstract queuing and batching mechanisms that will be different in the future - Add SchedulerV2/V3 impl
2024-06-04 15:56:56 +02:00 · 2024-06-04 15:56:56 +02:00 · 757223b352
parent fec0167a12
commit 757223b352
32 changed files with 3798 additions and 912 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4,9 +4,9 @@ version = 3

 [[package]]
 name = "addr2line"
-version = "0.21.0"
+version = "0.22.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
+checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678"
 dependencies = [
 "gimli",
 ]
@ -350,9 +350,9 @@ dependencies = [

 [[package]]
 name = "backtrace"
-version = "0.3.71"
+version = "0.3.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d"
+checksum = "17c6a35df3749d2e8bb1b7b21a976d82b15548788d2735b9d82f329268f71a11"
 dependencies = [
 "addr2line",
 "cc",
@ -1138,9 +1138,9 @@ dependencies = [

 [[package]]
 name = "gimli"
-version = "0.28.1"
+version = "0.29.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"
+checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd"

 [[package]]
 name = "glob"
@ -1396,9 +1396,9 @@ dependencies = [

 [[package]]
 name = "hyper-util"
-version = "0.1.4"
+version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d8d52be92d09acc2e01dddb7fde3ad983fc6489c7db4837e605bc3fca4cb63e"
+checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56"
 dependencies = [
 "bytes",
 "futures-util",
@ -1938,11 +1938,10 @@ dependencies = [

 [[package]]
 name = "native-tls"
-version = "0.2.11"
+version = "0.2.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
+checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466"
 dependencies = [
- "lazy_static",
 "libc",
 "log",
 "openssl",
@ -2168,9 +2167,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"

 [[package]]
 name = "object"
-version = "0.32.2"
+version = "0.35.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
+checksum = "b8ec7ab813848ba4522158d5517a6093db1ded27575b070f4177b8d12b41db5e"
 dependencies = [
 "memchr",
 ]
@ -2563,9 +2562,9 @@ dependencies = [

 [[package]]
 name = "proc-macro2"
-version = "1.0.84"
+version = "1.0.85"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6"
+checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23"
 dependencies = [
 "unicode-ident",
 ]
@ -3554,6 +3553,7 @@ dependencies = [
 name = "text-generation-client"
 version = "2.0.5-dev0"
 dependencies = [
+ "async-trait",
 "base64 0.22.1",
 "futures",
 "grpc-metadata",
@ -3752,9 +3752,9 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.37.0"
+version = "1.38.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787"
+checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
 dependencies = [
 "backtrace",
 "bytes",
@ -3781,9 +3781,9 @@ dependencies = [

 [[package]]
 name = "tokio-macros"
-version = "2.2.0"
+version = "2.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
+checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a"
 dependencies = [
 "proc-macro2",
 "quote",
@ -4733,9 +4733,9 @@ checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"

 [[package]]
 name = "winnow"
-version = "0.6.8"
+version = "0.6.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3c52e9c97a68071b23e836c9380edae937f17b9c4667bd021973efc689f618d"
+checksum = "86c949fede1d13936a99f14fafd3e76fd642b556dd2ce96287fbe2e0151bfac6"
 dependencies = [
 "memchr",
 ]
--- a/benchmark/src/generation.rs
+++ b/benchmark/src/generation.rs
@ -1,8 +1,9 @@
 use std::time::{Duration, Instant};
-use text_generation_client::{
-    Batch, CachedBatch, Chunk, ClientError, Input, NextTokenChooserParameters, Request,
-    ShardedClient, StoppingCriteriaParameters,
+use text_generation_client::v3::{
+    Batch, CachedBatch, NextTokenChooserParameters, Request, ShardedClient,
+    StoppingCriteriaParameters,
 };
+use text_generation_client::{Chunk, ClientError, Input};
 use tokenizers::{Tokenizer, TruncationDirection};
 use tokio::sync::{broadcast, mpsc};

--- a/benchmark/src/lib.rs
+++ b/benchmark/src/lib.rs
@ -8,7 +8,7 @@ use crate::app::App;
 use crate::event::Event;
 use crossterm::ExecutableCommand;
 use std::io;
-use text_generation_client::{GrammarType, NextTokenChooserParameters, ShardedClient};
+use text_generation_client::v3::{GrammarType, NextTokenChooserParameters, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::sync::{broadcast, mpsc};
 use tui::backend::CrosstermBackend;
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@ -4,7 +4,7 @@
 /// and: https://github.com/orhun/rust-tui-template
 use clap::Parser;
 use std::path::Path;
-use text_generation_client::ShardedClient;
+use text_generation_client::v3::ShardedClient;
 use tokenizers::{FromPretrainedParameters, Tokenizer};
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
--- a/proto/generate.proto
+++ b/proto/generate.proto
@ -51,27 +51,6 @@ message ClearCacheRequest {
 /// Empty response
 message ClearCacheResponse {}

-message Image {
-    /// Binary image data.
-    bytes data = 1;
-
-    /// Image MIME type.
-    string mimetype = 2;
-}
-
-message InputChunk {
-    oneof chunk {
-        /// Plain text data
-        string text = 1;
-        /// Image data
-        Image image = 2;
-    }
-}
-
-message Input {
-    repeated InputChunk chunks = 1;
-  }
-
 enum GrammarType {
    GRAMMAR_TYPE_NONE = 0;
    GRAMMAR_TYPE_JSON = 1;
@ -116,9 +95,7 @@ message StoppingCriteriaParameters {
 message Request {
    /// Request ID
    uint64 id = 1;
-    /// The generation context as chunks
-    Input input_chunks = 8;
-    /// The generation context, stringified input_chunks
+    /// The generation context
    string inputs = 2;
    /// Context truncation
    uint32 truncate = 3;
--- a/proto/v3/generate.proto
+++ b/proto/v3/generate.proto
@ -0,0 +1,259 @@
+syntax = "proto3";
+
+package generate.v3;
+
+service TextGenerationService {
+    /// Model Info
+    rpc Info (InfoRequest) returns (InfoResponse) {}
+    /// Service discovery
+    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}
+    /// Empties batch cache
+    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
+    /// Remove requests from a cached batch
+    rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
+    /// Warmup the model and compute max cache size
+    rpc Warmup (WarmupRequest) returns (WarmupResponse);
+    /// Prefill batch and decode first token
+    rpc Prefill (PrefillRequest) returns (PrefillResponse);
+    /// Decode token for a list of prefilled batches
+    rpc Decode (DecodeRequest) returns (DecodeResponse);
+    /// Health check
+    rpc Health (HealthRequest) returns (HealthResponse);
+}
+
+message HealthRequest {}
+message HealthResponse {}
+
+/// Empty request
+message InfoRequest {}
+
+message InfoResponse {
+    bool requires_padding = 1;
+    string dtype = 2;
+    string device_type = 3;
+    optional uint32 window_size = 4;
+    uint32 speculate = 5;
+}
+
+/// Empty request
+message ServiceDiscoveryRequest {}
+
+message ServiceDiscoveryResponse {
+    /// Other shards urls
+    repeated string urls = 1;
+}
+
+message ClearCacheRequest {
+    /// Optional batch id
+    optional uint64 id = 1;
+}
+
+/// Empty response
+message ClearCacheResponse {}
+
+message Image {
+    /// Binary image data.
+    bytes data = 1;
+
+    /// Image MIME type.
+    string mimetype = 2;
+}
+
+message InputChunk {
+    oneof chunk {
+        /// Plain text data
+        string text = 1;
+        /// Image data
+        Image image = 2;
+    }
+}
+
+message Input {
+    repeated InputChunk chunks = 1;
+  }
+
+enum GrammarType {
+    GRAMMAR_TYPE_NONE = 0;
+    GRAMMAR_TYPE_JSON = 1;
+    GRAMMAR_TYPE_REGEX = 2;
+}
+
+message NextTokenChooserParameters {
+    /// exponential scaling output probability distribution
+    float temperature = 1;
+    /// restricting to the k highest probability elements
+    uint32 top_k = 2;
+    /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
+    float top_p = 3;
+    /// restricting to top tokens summing to prob_cut_off <= prob_cut_off
+    float typical_p = 4;
+    /// apply sampling on the logits
+    bool do_sample = 5;
+    /// random seed for sampling
+    uint64 seed = 6;
+    /// repetition penalty
+    float repetition_penalty = 7;
+    /// frequency penalty
+    float frequency_penalty = 9;
+    /// token watermarking using "A Watermark for Large Language Models"
+    bool watermark = 8;
+    /// grammar (applied if not empty)
+    string grammar = 10;
+    /// grammar type
+    GrammarType grammar_type = 11;
+}
+
+message StoppingCriteriaParameters {
+    /// Maximum number of generated tokens
+    uint32 max_new_tokens = 1;
+    /// Optional stopping sequences
+    repeated string stop_sequences = 2;
+    /// Ignore end of sequence token
+    /// used for benchmarking
+    bool ignore_eos_token = 3;
+}
+
+message Request {
+    /// Request ID
+    uint64 id = 1;
+    /// The generation context as chunks
+    Input input_chunks = 8;
+    /// The generation context, stringified input_chunks
+    string inputs = 2;
+    /// Context truncation
+    uint32 truncate = 3;
+    /// Next Token Chooser Parameters
+    NextTokenChooserParameters parameters = 4;
+    /// Stopping Criteria Parameters
+    StoppingCriteriaParameters stopping_parameters = 5;
+    /// Return prefill logprobs
+    bool prefill_logprobs = 6;
+    /// Return most likely n tokens
+    uint32 top_n_tokens = 7;
+}
+
+message Batch {
+    /// Batch ID
+    uint64 id = 1;
+    /// Individual requests
+    repeated Request requests = 2;
+    /// Batch size (==len(requests))
+    uint32 size = 3;
+    /// Maximum number of tokens this batch will grow to
+    uint32 max_tokens = 4;
+}
+
+message CachedBatch {
+    /// Batch ID
+    uint64 id = 1;
+    /// Individual requests ids
+    repeated uint64 request_ids = 2;
+    /// Batch size (==len(requests))
+    uint32 size = 3;
+    /// Maximum number of tokens this batch will grow to
+    uint32 max_tokens = 4;
+}
+
+enum FinishReason {
+    FINISH_REASON_LENGTH = 0;
+    FINISH_REASON_EOS_TOKEN = 1;
+    FINISH_REASON_STOP_SEQUENCE = 2;
+}
+
+message GeneratedText {
+    /// Output
+    string text = 1;
+    /// Number of generated tokens
+    uint32 generated_tokens = 2;
+    /// Finish reason
+    FinishReason finish_reason = 3;
+    /// Seed
+    optional uint64 seed = 4;
+}
+
+message Tokens {
+    /// Token IDs
+    repeated uint32 ids = 1;
+    /// Logprobs
+    repeated float logprobs = 2;
+    /// tokens
+    repeated string texts = 3;
+    /// special
+    repeated bool is_special = 4;
+}
+
+message Generation {
+    /// Request ID
+    uint64 request_id = 1;
+    /// Prefill tokens (optional)
+    Tokens prefill_tokens = 2;
+    Tokens tokens = 3;
+    /// Complete generated text
+    optional GeneratedText generated_text = 4;
+    /// Top tokens
+    repeated Tokens top_tokens = 5;
+}
+
+message FilterBatchRequest {
+    /// Batch ID
+    uint64 batch_id = 1;
+    /// Requests to keep
+    repeated uint64 request_ids = 2;
+}
+
+message FilterBatchResponse {
+    /// Filtered Batch (cached)
+    CachedBatch batch = 1;
+}
+
+
+message PrefillRequest {
+    /// Batch
+    Batch batch = 1;
+}
+
+message PrefillResponse {
+    /// Generation
+    repeated Generation generations = 1;
+    /// Next batch (cached)
+    optional CachedBatch batch = 2;
+    /// Forward elapsed time in nanoseconds
+    uint64 forward_ns = 3;
+    /// Decode elapsed time in nanoseconds
+    uint64 decode_ns = 4;
+    /// Total elapsed time in nanoseconds
+    uint64 total_ns = 5;
+}
+
+message DecodeRequest {
+    /// Cached batches
+    repeated CachedBatch batches = 1;
+}
+
+message DecodeResponse {
+    /// Decodes
+    repeated Generation generations = 1;
+    /// Next batch (cached)
+    optional CachedBatch batch = 2;
+    /// Forward elapsed time in nanoseconds
+    uint64 forward_ns = 3;
+    /// Decode elapsed time in nanoseconds
+    uint64 decode_ns = 4;
+    /// Total elapsed time in nanoseconds
+    uint64 total_ns = 5;
+    /// Concatenate elapsed time in nanoseconds
+    optional uint64 concat_ns = 6;
+}
+
+message WarmupRequest {
+    /// Batch to warmup on
+    Batch batch = 1;
+    uint32 max_input_length = 2;
+    uint32 max_prefill_tokens = 3;
+    uint32 max_total_tokens = 4;
+}
+
+message WarmupResponse {
+    /// Maximum number of tokens supported by the model
+    optional uint32 max_supported_total_tokens = 1;
+}
--- a/router/client/Cargo.toml
+++ b/router/client/Cargo.toml
@ -6,6 +6,7 @@ authors.workspace = true
 homepage.workspace = true

 [dependencies]
+async-trait = "^0.1"
 base64 = { workspace = true }
 futures = "^0.3"
 grpc-metadata = { path = "../grpc-metadata" }
--- a/router/client/build.rs
+++ b/router/client/build.rs
@ -1,19 +1,31 @@
 use std::fs;

 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    println!("cargo:rerun-if-changed=../../proto/generate.proto");
-    fs::create_dir("src/pb").unwrap_or(());
+    println!("cargo:rerun-if-changed=../../proto/**");

+    fs::create_dir_all("src/v2/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");

    tonic_build::configure()
        .build_client(true)
        .build_server(false)
-        .out_dir("src/pb")
+        .out_dir("src/v2/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));

+    fs::create_dir_all("src/v3/pb").unwrap_or(());
+    let mut config = prost_build::Config::new();
+    config.protoc_arg("--experimental_allow_proto3_optional");
+
+    tonic_build::configure()
+        .build_client(true)
+        .build_server(false)
+        .out_dir("src/v3/pb")
+        .include_file("mod.rs")
+        .compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
+        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
+
    Ok(())
 }
--- a/router/client/src/lib.rs
+++ b/router/client/src/lib.rs
@ -1,25 +1,35 @@
 //! Text Generation gRPC client library

-mod client;
-#[allow(clippy::derive_partial_eq_without_eq)]
-mod pb;
-mod sharded_client;
+pub mod v2;
+pub mod v3;

+use async_trait::async_trait;
 use base64::{engine::general_purpose::STANDARD, Engine};
-pub use client::Client;
-pub use pb::generate::v2::input_chunk::Chunk;
-pub use pb::generate::v2::HealthResponse;
-pub use pb::generate::v2::Image;
-pub use pb::generate::v2::InfoResponse as ShardInfo;
-pub use pb::generate::v2::{
-    Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, Input, InputChunk,
-    NextTokenChooserParameters, Request, StoppingCriteriaParameters, Tokens,
-};
-pub use sharded_client::ShardedClient;
 use thiserror::Error;
 use tonic::transport;
 use tonic::Status;

+pub use v3::{Chunk, Image, Input, InputChunk};
+
+#[async_trait]
+pub trait Health {
+    /// Check if a generate server is healthy by asking it to allocate a tensor on device
+    async fn device_health(&self) -> Result<()>;
+
+    /// Check if a generate server is healthy by doing a forward pass.
+    /// EXPENSIVE
+    async fn model_health(&self) -> Result<()>;
+}
+
+#[derive(Debug)]
+pub struct ShardInfo {
+    pub requires_padding: bool,
+    pub dtype: String,
+    pub device_type: String,
+    pub window_size: Option<u32>,
+    pub speculate: u32,
+}
+
 #[derive(Error, Debug, Clone)]
 pub enum ClientError {
    #[error("Could not connect to Text Generation server: {0}")]
@ -46,8 +56,6 @@ impl From<transport::Error> for ClientError {
    }
 }

-pub type Result<T> = std::result::Result<T, ClientError>;
-
 // Small convenience re-wrapping of `Chunk`.
 impl From<Chunk> for InputChunk {
    fn from(chunk: Chunk) -> Self {
@ -77,3 +85,7 @@ impl ChunksToString for Vec<InputChunk> {
        output
    }
 }
+
+static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
+
+pub type Result<T> = std::result::Result<T, ClientError>;
--- a/router/client/src/pb/.gitignore
+++ b/router/client/src/pb/.gitignore
@ -1 +0,0 @@
-*.rs
--- a/router/client/src/v2/client.rs
+++ b/router/client/src/v2/client.rs
@ -0,0 +1,258 @@
+/// Single shard Client
+use crate::v2::pb;
+use crate::{ClientError, Result};
+
+use crate::WARMUP_IMAGE_BASE64;
+use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v2::*;
+use std::cmp::min;
+use std::time::Duration;
+use tonic::transport::{Channel, Uri};
+use tracing::instrument;
+
+/// Text Generation Inference gRPC client
+#[derive(Debug, Clone)]
+pub struct Client {
+    stub: TextGenerationServiceClient<Channel>,
+}
+
+impl Client {
+    /// Returns a client connected to the given url
+    pub async fn connect(uri: Uri) -> Result<Self> {
+        let channel = Channel::builder(uri).connect().await?;
+
+        Ok(Self {
+            stub: TextGenerationServiceClient::new(channel),
+        })
+    }
+
+    /// Returns a client connected to the given unix socket
+    pub async fn connect_uds(path: String) -> Result<Self> {
+        let channel = Channel::from_shared("http://[::]:50051".to_string())
+            .unwrap()
+            .connect_with_connector(tower::service_fn(move |_: Uri| {
+                tokio::net::UnixStream::connect(path.clone())
+            }))
+            .await?;
+
+        Ok(Self {
+            stub: TextGenerationServiceClient::new(channel),
+        })
+    }
+
+    /// Returns a list of uris or unix sockets of all shards
+    #[instrument(skip(self))]
+    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
+        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
+        let response = self.stub.service_discovery(request).await.map_err(|_| {
+            ClientError::Connection("Server does not support v2 interface".to_string())
+        })?;
+        let urls = response
+            .into_inner()
+            .urls
+            .into_iter()
+            // Remove unix socket prefix
+            .map(|url| match url.strip_prefix("unix://") {
+                None => url,
+                Some(stripped_url) => stripped_url.to_string(),
+            })
+            .collect();
+        Ok(urls)
+    }
+
+    /// Get model info
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<InfoResponse> {
+        let request = tonic::Request::new(InfoRequest {}).inject_context();
+        let response = self.stub.info(request).await?.into_inner();
+        Ok(response)
+    }
+
+    /// Get model health
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let request = tonic::Request::new(HealthRequest {}).inject_context();
+        let response = self.stub.health(request).await?.into_inner();
+        Ok(response)
+    }
+
+    /// Clear the past generations cache
+    #[instrument(skip(self))]
+    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
+        let request = tonic::Request::new(ClearCacheRequest { id: batch_id }).inject_context();
+        self.stub.clear_cache(request).await?;
+        Ok(())
+    }
+
+    /// Filter a cached batch
+    #[instrument(skip(self))]
+    pub async fn filter_batch(
+        &mut self,
+        batch_id: u64,
+        request_ids: Vec<u64>,
+    ) -> Result<Option<CachedBatch>> {
+        let request = tonic::Request::new(FilterBatchRequest {
+            batch_id,
+            request_ids,
+        })
+        .inject_context();
+        let filtered_batch = self.stub.filter_batch(request).await?.into_inner();
+        Ok(filtered_batch.batch)
+    }
+
+    /// Warmup on a max size batch
+    ///
+    /// Returns the maximum amount of tokens supported by the hardware
+    #[instrument(skip_all)]
+    pub async fn warmup(
+        &mut self,
+        max_input_length: u32,
+        max_prefill_tokens: u32,
+        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
+    ) -> Result<Option<u32>> {
+        let mut n_tokens = 0;
+        let mut requests = Vec::new();
+        // Create requests
+        while n_tokens < max_prefill_tokens {
+            let truncate = min(max_input_length, max_prefill_tokens - n_tokens);
+
+            let mut inputs = String::new();
+            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
+            if n_tokens == 0 {
+                // 1 request is enough to test vision heads.
+                // Sending images on other queries messes up easily with truncation.
+                inputs.push_str(&format!(
+                    "![](data:image/jpeg;base64,{WARMUP_IMAGE_BASE64})",
+                ));
+            }
+
+            requests.push(Request {
+                id: 0,
+                inputs,
+                // We truncate the input on the server side to be sure that it has the correct size
+                truncate,
+                // Set sampling parameters to also take these ops into account in the max memory
+                parameters: Some(NextTokenChooserParameters {
+                    temperature: 0.9,
+                    top_k: 10,
+                    top_p: 0.9,
+                    typical_p: 0.9,
+                    do_sample: false,
+                    seed: 0,
+                    repetition_penalty: 1.2,
+                    frequency_penalty: 0.1,
+                    watermark: true,
+                    grammar: String::new(),
+                    grammar_type: GrammarType::None as i32,
+                }),
+                stopping_parameters: Some(StoppingCriteriaParameters {
+                    max_new_tokens: max_total_tokens - truncate,
+                    stop_sequences: vec![],
+                    ignore_eos_token: true,
+                }),
+                prefill_logprobs: true,
+                top_n_tokens: 20,
+            });
+            n_tokens += max_input_length;
+
+            // Check max_batch_size
+            if Some(requests.len()) == max_batch_size {
+                break;
+            }
+        }
+
+        let batch = Batch {
+            id: 0,
+            size: requests.len() as u32,
+            requests,
+            max_tokens: 0,
+        };
+
+        let request = tonic::Request::new(WarmupRequest {
+            batch: Some(batch),
+            max_input_length,
+            max_prefill_tokens,
+            max_total_tokens,
+        })
+        .inject_context();
+        let response = self.stub.warmup(request).await?.into_inner();
+        Ok(response.max_supported_total_tokens)
+    }
+
+    /// Generate one token for each request in the given batch
+    ///
+    /// Returns Generation for each request in batch
+    /// and the next cached batch
+    #[instrument(skip_all, fields(id = &batch.id, size = &batch.size))]
+    pub async fn prefill(
+        &mut self,
+        batch: Batch,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
+        let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context();
+        let response = self.stub.prefill(request).await?.into_inner();
+        Ok((
+            response.generations,
+            response.batch,
+            PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns),
+        ))
+    }
+
+    /// Generate one token for each request in the given cached batches
+    ///
+    /// Returns Generation for each request in batches
+    /// and the next cached batch
+    #[instrument(skip_all, fields(size = batches.iter().map(|batch|{batch.size}).sum::<u32>()))]
+    pub async fn decode(
+        &mut self,
+        batches: Vec<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
+        let request = tonic::Request::new(DecodeRequest { batches }).inject_context();
+        let response = self.stub.decode(request).await?.into_inner();
+        Ok((
+            response.generations,
+            response.batch,
+            DecodeTimings::new(
+                response.concat_ns,
+                response.forward_ns,
+                response.decode_ns,
+                response.total_ns,
+            ),
+        ))
+    }
+}
+
+pub struct PrefillTimings {
+    pub forward: Duration,
+    pub decode: Duration,
+    pub total: Duration,
+}
+
+impl PrefillTimings {
+    fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        Self {
+            forward: Duration::from_nanos(forward_ns),
+            decode: Duration::from_nanos(decode_ns),
+            total: Duration::from_nanos(total_ns),
+        }
+    }
+}
+
+pub struct DecodeTimings {
+    pub concat: Option<Duration>,
+    pub forward: Duration,
+    pub decode: Duration,
+    pub total: Duration,
+}
+
+impl DecodeTimings {
+    fn new(concat_ns: Option<u64>, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self {
+        Self {
+            concat: concat_ns.map(Duration::from_nanos),
+            forward: Duration::from_nanos(forward_ns),
+            decode: Duration::from_nanos(decode_ns),
+            total: Duration::from_nanos(total_ns),
+        }
+    }
+}
--- a/router/client/src/v2/mod.rs
+++ b/router/client/src/v2/mod.rs
@ -0,0 +1,13 @@
+#[allow(clippy::derive_partial_eq_without_eq)]
+mod pb;
+
+mod client;
+mod sharded_client;
+
+pub use client::Client;
+pub use pb::generate::v2::HealthResponse;
+pub use pb::generate::v2::{
+    Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType, InfoResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters, Tokens,
+};
+pub use sharded_client::ShardedClient;
--- a/router/client/src/v2/pb/.gitignore
+++ b/router/client/src/v2/pb/.gitignore
@ -0,0 +1 @@
+*
--- a/router/client/src/v2/sharded_client.rs
+++ b/router/client/src/v2/sharded_client.rs
@ -1,10 +1,17 @@
-use crate::client::{DecodeTimings, PrefillTimings};
 /// Multi shard Client
-use crate::{Batch, CachedBatch, Client, Generation, HealthResponse, ShardInfo};
+use crate::{v2, Health, ShardInfo};
 use crate::{ClientError, Result};
+
+use crate::v2::InfoResponse;
+use async_trait::async_trait;
 use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;
+use v2::client::{DecodeTimings, PrefillTimings};
+use v2::{
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};

 #[derive(Debug, Clone)]
 /// Text Generation Inference gRPC multi client
@ -47,7 +54,7 @@ impl ShardedClient {
            .iter_mut()
            .map(|client| client.info())
            .collect();
-        join_all(futures).await.pop().unwrap()
+        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
    }

    /// GRPC health check
@ -185,3 +192,60 @@ impl ShardedClient {
        Ok((generations, next_batch, timings))
    }
 }
+
+impl From<InfoResponse> for ShardInfo {
+    fn from(value: InfoResponse) -> Self {
+        Self {
+            requires_padding: value.requires_padding,
+            dtype: value.dtype,
+            device_type: value.device_type,
+            window_size: value.window_size,
+            speculate: value.speculate,
+        }
+    }
+}
+
+#[async_trait]
+impl Health for ShardedClient {
+    async fn device_health(&self) -> Result<()> {
+        self.clone().health().await?;
+        Ok(())
+    }
+
+    async fn model_health(&self) -> Result<()> {
+        // Dummy batch of 1 token and 1 generated token
+        let liveness_request = Request {
+            id: u64::MAX,
+            inputs: "liveness".to_string(),
+            truncate: 10,
+            prefill_logprobs: false,
+            parameters: Some(NextTokenChooserParameters {
+                temperature: 1.0,
+                top_k: 0,
+                top_p: 1.0,
+                typical_p: 1.0,
+                do_sample: false,
+                seed: 0,
+                repetition_penalty: 1.0,
+                frequency_penalty: 0.0,
+                watermark: false,
+                grammar: String::new(),
+                grammar_type: GrammarType::None as i32,
+            }),
+            stopping_parameters: Some(StoppingCriteriaParameters {
+                max_new_tokens: 1,
+                stop_sequences: vec![],
+                ignore_eos_token: false,
+            }),
+            top_n_tokens: 0,
+        };
+        let batch = Batch {
+            id: u64::MAX,
+            requests: vec![liveness_request],
+            size: 1,
+            max_tokens: 2,
+        };
+        self.clone().prefill(batch).await?;
+        Ok(())
+    }
+}
--- a/router/client/src/v3/client.rs
+++ b/router/client/src/v3/client.rs
@ -1,17 +1,16 @@
+use crate::v3::{pb, Chunk};
+use crate::{ClientError, Result, WARMUP_IMAGE_BASE64};
 /// Single shard Client
-use crate::pb::generate::v2::text_generation_service_client::TextGenerationServiceClient;
-use crate::pb::generate::v2::*;
-use crate::{Chunk, Result};
 use base64::engine::general_purpose::STANDARD;
 use base64::Engine;
 use grpc_metadata::InjectTelemetryContext;
+use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
+use pb::generate::v3::*;
 use std::cmp::min;
 use std::time::Duration;
 use tonic::transport::{Channel, Uri};
 use tracing::instrument;

-static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
-
 /// Text Generation Inference gRPC client
 #[derive(Debug, Clone)]
 pub struct Client {
@ -46,7 +45,9 @@ impl Client {
    #[instrument(skip(self))]
    pub async fn service_discovery(&mut self) -> Result<Vec<String>> {
        let request = tonic::Request::new(ServiceDiscoveryRequest {}).inject_context();
-        let response = self.stub.service_discovery(request).await?;
+        let response = self.stub.service_discovery(request).await.map_err(|_| {
+            ClientError::Connection("Server does not support v3 interface".to_string())
+        })?;
        let urls = response
            .into_inner()
            .urls
@ -133,6 +134,7 @@ impl Client {

            // Send stringly-typed inputs for compatibility for backends that haven't
            // been updated to support chunks.
+
            let mut inputs = String::new();
            inputs.push_str(&"_test ".to_string().repeat(max_input_length as usize));
            if n_tokens == 0 {
@ -145,10 +147,10 @@ impl Client {

            requests.push(Request {
                id: 0,
+                inputs,
                input_chunks: Some(Input {
                    chunks: input_chunks,
                }),
-                inputs,
                // We truncate the input on the server side to be sure that it has the correct size
                truncate,
                // Set sampling parameters to also take these ops into account in the max memory
--- a/router/client/src/v3/mod.rs
+++ b/router/client/src/v3/mod.rs
@ -0,0 +1,13 @@
+#[allow(clippy::derive_partial_eq_without_eq)]
+mod pb;
+
+mod client;
+mod sharded_client;
+
+pub use client::Client;
+pub use pb::generate::v3::{
+    input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
+    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
+    StoppingCriteriaParameters, Tokens,
+};
+pub use sharded_client::ShardedClient;
--- a/router/client/src/v3/pb/.gitignore
+++ b/router/client/src/v3/pb/.gitignore
@ -0,0 +1 @@
+*
--- a/router/client/src/v3/sharded_client.rs
+++ b/router/client/src/v3/sharded_client.rs
@ -0,0 +1,254 @@
+/// Multi shard Client
+use crate::{v3, Health, ShardInfo};
+use crate::{ClientError, Result};
+
+use crate::v3::{Chunk, InfoResponse, Input};
+use async_trait::async_trait;
+use futures::future::join_all;
+use tonic::transport::Uri;
+use tracing::instrument;
+use v3::client::{DecodeTimings, PrefillTimings};
+use v3::{
+    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
+    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
+
+#[derive(Debug, Clone)]
+/// Text Generation Inference gRPC multi client
+pub struct ShardedClient {
+    clients: Vec<Client>,
+}
+
+impl ShardedClient {
+    fn new(clients: Vec<Client>) -> Self {
+        Self { clients }
+    }
+
+    /// Create a new ShardedClient from a master client. The master client will communicate with
+    /// the other shards and returns all uris/unix sockets with the `service_discovery` gRPC method.
+    async fn from_master_client(mut master_client: Client) -> Result<Self> {
+        // Get all uris/unix sockets from the master client
+        let uris = master_client.service_discovery().await?;
+        let futures = uris.into_iter().map(Client::connect_uds);
+        let clients: Result<Vec<Client>> = join_all(futures).await.into_iter().collect();
+        Ok(Self::new(clients?))
+    }
+
+    /// Returns a client connected to the given uri
+    pub async fn connect(uri: Uri) -> Result<Self> {
+        let master_client = Client::connect(uri).await?;
+        Self::from_master_client(master_client).await
+    }
+
+    /// Returns a client connected to the given unix socket
+    pub async fn connect_uds(path: String) -> Result<Self> {
+        let master_client = Client::connect_uds(path).await?;
+        Self::from_master_client(master_client).await
+    }
+
+    /// Get the model info
+    #[instrument(skip(self))]
+    pub async fn info(&mut self) -> Result<ShardInfo> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.info())
+            .collect();
+        join_all(futures).await.pop().unwrap().map(ShardInfo::from)
+    }
+
+    /// GRPC health check
+    #[instrument(skip(self))]
+    pub async fn health(&mut self) -> Result<HealthResponse> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.health())
+            .collect();
+        join_all(futures).await.pop().unwrap()
+    }
+
+    /// Clear the past generations cache
+    #[instrument(skip(self))]
+    pub async fn clear_cache(&mut self, batch_id: Option<u64>) -> Result<()> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| client.clear_cache(batch_id))
+            .collect();
+        join_all(futures).await.into_iter().collect()
+    }
+
+    /// Filter a cached batch
+    #[instrument(skip(self))]
+    pub async fn filter_batch(
+        &mut self,
+        batch_id: u64,
+        request_ids: Vec<u64>,
+    ) -> Result<Option<CachedBatch>> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.filter_batch(batch_id, request_ids.clone())))
+            .collect();
+        // all shards return the same message
+        join_all(futures).await.pop().unwrap()
+    }
+
+    /// Warmup on a max size batch
+    ///
+    /// Returns the maximum amount of tokens supported by the hardware
+    #[instrument(skip(self))]
+    pub async fn warmup(
+        &mut self,
+        max_input_length: u32,
+        max_prefill_tokens: u32,
+        max_total_tokens: u32,
+        max_batch_size: Option<usize>,
+    ) -> Result<Option<u32>> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| {
+                Box::pin(client.warmup(
+                    max_input_length,
+                    max_prefill_tokens,
+                    max_total_tokens,
+                    max_batch_size,
+                ))
+            })
+            .collect();
+        // Take the minimum value
+        let results = join_all(futures)
+            .await
+            .into_iter()
+            .collect::<Result<Vec<Option<u32>>>>()?;
+        Ok(results.into_iter().flatten().min())
+    }
+
+    /// Generate one token for each request in the given batch
+    ///
+    /// Returns Generation for each request in batch
+    /// and the next cached batch
+    #[instrument(skip_all, fields(id = & batch.id, size = & batch.size))]
+    pub async fn prefill(
+        &mut self,
+        batch: Batch,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.prefill(batch.clone())))
+            .collect();
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, PrefillTimings)>> =
+            join_all(futures).await.into_iter().collect();
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
+    }
+
+    /// Generate one token for each request in the given cached batches
+    ///
+    /// Returns Generation for each request in batches
+    /// and the next cached batch
+    #[instrument(skip_all, fields(size = batches.iter().map(| batch | {batch.size}).sum::< u32 > ()))]
+    pub async fn decode(
+        &mut self,
+        batches: Vec<CachedBatch>,
+    ) -> Result<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)> {
+        let futures: Vec<_> = self
+            .clients
+            .iter_mut()
+            .map(|client| Box::pin(client.decode(batches.clone())))
+            .collect();
+        #[allow(clippy::type_complexity)]
+        let results: Result<Vec<(Vec<Generation>, Option<CachedBatch>, DecodeTimings)>> =
+            join_all(futures).await.into_iter().collect();
+        let mut results = results?;
+
+        let (mut generations, next_batch, mut timings) =
+            results.pop().ok_or(ClientError::EmptyResults)?;
+
+        // Merge generations from different model shards
+        for (mut shard_generations, _, shard_timings) in results.into_iter() {
+            generations.append(&mut shard_generations);
+            // Return the timings of the slowest shard
+            if shard_timings.total > timings.total {
+                timings = shard_timings;
+            }
+        }
+        Ok((generations, next_batch, timings))
+    }
+}
+
+impl From<InfoResponse> for ShardInfo {
+    fn from(value: InfoResponse) -> Self {
+        Self {
+            requires_padding: value.requires_padding,
+            dtype: value.dtype,
+            device_type: value.device_type,
+            window_size: value.window_size,
+            speculate: value.speculate,
+        }
+    }
+}
+
+#[async_trait]
+impl Health for ShardedClient {
+    async fn device_health(&self) -> Result<()> {
+        self.clone().health().await?;
+        Ok(())
+    }
+
+    async fn model_health(&self) -> Result<()> {
+        // Dummy batch of 1 token and 1 generated token
+        let liveness_request = Request {
+            id: u64::MAX,
+            inputs: "liveness".to_string(),
+            input_chunks: Some(Input {
+                chunks: vec![Chunk::Text("liveness".into()).into()],
+            }),
+            truncate: 10,
+            prefill_logprobs: false,
+            parameters: Some(NextTokenChooserParameters {
+                temperature: 1.0,
+                top_k: 0,
+                top_p: 1.0,
+                typical_p: 1.0,
+                do_sample: false,
+                seed: 0,
+                repetition_penalty: 1.0,
+                frequency_penalty: 0.0,
+                watermark: false,
+                grammar: String::new(),
+                grammar_type: GrammarType::None as i32,
+            }),
+            stopping_parameters: Some(StoppingCriteriaParameters {
+                max_new_tokens: 1,
+                stop_sequences: vec![],
+                ignore_eos_token: false,
+            }),
+            top_n_tokens: 0,
+        };
+        let batch = Batch {
+            id: u64::MAX,
+            requests: vec![liveness_request],
+            size: 1,
+            max_tokens: 2,
+        };
+        self.clone().prefill(batch).await?;
+        Ok(())
+    }
+}
--- a/router/src/health.rs
+++ b/router/src/health.rs
@ -1,75 +0,0 @@
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
-use text_generation_client::{
-    Batch, Input, NextTokenChooserParameters, Request, ShardedClient, StoppingCriteriaParameters,
-};
-use text_generation_client::{Chunk, GrammarType as ProtoGrammarType};
-
-// Note: Request ids and batch ids cannot collide.
-const LIVENESS_ID: u64 = u64::MAX;
-const BATCH_ID: u64 = u64::MAX;
-
-#[derive(Clone, Debug)]
-pub(crate) struct Health {
-    client: ShardedClient,
-    generation_health: Arc<AtomicBool>,
-}
-
-impl Health {
-    pub(crate) fn new(client: ShardedClient, generation_health: Arc<AtomicBool>) -> Self {
-        Self {
-            client,
-            generation_health,
-        }
-    }
-
-    pub(crate) async fn check(&mut self) -> bool {
-        if self.generation_health.load(Ordering::SeqCst) {
-            // Generation is healthy, we only check that the shards are answering gRPC calls
-            self.client.health().await.is_ok()
-        } else {
-            // Generation is unhealthy or have not sent any generation request yet
-
-            // Dummy batch of 1 token and 1 generated token
-            let liveness_request = Request {
-                id: LIVENESS_ID,
-                input_chunks: Some(Input {
-                    chunks: vec![Chunk::Text("liveness".into()).into()],
-                }),
-                inputs: "liveness".to_string(),
-                truncate: 10,
-                prefill_logprobs: false,
-                parameters: Some(NextTokenChooserParameters {
-                    temperature: 1.0,
-                    top_k: 0,
-                    top_p: 1.0,
-                    typical_p: 1.0,
-                    do_sample: false,
-                    seed: 0,
-                    repetition_penalty: 1.0,
-                    frequency_penalty: 0.0,
-                    watermark: false,
-                    grammar: String::new(),
-                    grammar_type: ProtoGrammarType::None as i32,
-                }),
-                stopping_parameters: Some(StoppingCriteriaParameters {
-                    max_new_tokens: 1,
-                    stop_sequences: vec![],
-                    ignore_eos_token: false,
-                }),
-                top_n_tokens: 0,
-            };
-            let batch = Batch {
-                id: BATCH_ID,
-                requests: vec![liveness_request],
-                size: 1,
-                max_tokens: 2,
-            };
-            // Skips the queue
-            let value = self.client.prefill(batch).await.is_ok();
-            // Update generation health
-            self.generation_health.store(value, Ordering::SeqCst);
-            value
-        }
-    }
-}
--- a/router/src/infer/health.rs
+++ b/router/src/infer/health.rs
@ -0,0 +1,34 @@
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::Arc;
+use text_generation_client::Health;
+
+#[derive(Clone)]
+pub(crate) struct HealthCheck {
+    client: Arc<dyn Health + Send + Sync>,
+    generation_health: Arc<AtomicBool>,
+}
+
+impl HealthCheck {
+    pub(crate) fn new(
+        client: Arc<dyn Health + Send + Sync>,
+        generation_health: Arc<AtomicBool>,
+    ) -> Self {
+        Self {
+            client,
+            generation_health,
+        }
+    }
+
+    pub(crate) async fn check(&mut self) -> bool {
+        let value = if self.generation_health.load(Ordering::SeqCst) {
+            // Generation is healthy, we only check that the shards can allocate on device
+            self.client.device_health().await
+        } else {
+            self.client.model_health().await
+        }
+        .is_ok();
+        // Update generation health
+        self.generation_health.store(value, Ordering::SeqCst);
+        value
+    }
+}
--- a/router/src/infer/mod.rs
+++ b/router/src/infer/mod.rs
@ -0,0 +1,522 @@
+mod health;
+pub(crate) mod v2;
+pub(crate) mod v3;
+
+pub(crate) use health::HealthCheck;
+
+use crate::validation::{ValidGenerateRequest, Validation, ValidationError};
+use crate::{
+    ChatTemplateInputs, ChatTemplateVersions, FinishReason, GenerateRequest, HubProcessorConfig,
+    HubTokenizerConfig, Message, MessageChunk, PrefillToken, Text, TextMessage, Token,
+};
+use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools};
+use futures::future::try_join_all;
+use minijinja::{Environment, ErrorKind, Template};
+use serde_json::{json, Map, Value};
+use std::collections::HashMap;
+use std::sync::Arc;
+use thiserror::Error;
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, TryAcquireError};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tokio_stream::StreamExt;
+use tracing::instrument;
+
+pub(crate) trait Scheduler {
+    fn schedule(
+        &self,
+        request: ValidGenerateRequest,
+        permit: OwnedSemaphorePermit,
+    ) -> Result<GenerateStreamResponse, InferError>;
+}
+
+/// Inference struct
+#[derive(Clone)]
+pub struct Infer {
+    /// Validation
+    validation: Validation,
+    /// Request scheduler
+    scheduler: Arc<dyn Scheduler + Send + Sync>,
+    /// Chat template
+    chat_template: Option<ChatTemplate>,
+    /// Inference limit
+    limit_concurrent_requests: Arc<Semaphore>,
+}
+
+impl Infer {
+    #[allow(clippy::too_many_arguments)]
+    pub(crate) fn new(
+        scheduler: Arc<dyn Scheduler + Send + Sync>,
+        validation: Validation,
+        max_concurrent_requests: usize,
+        tokenizer_config: HubTokenizerConfig,
+        processor_config: HubProcessorConfig,
+    ) -> Self {
+        let chat_template = tokenizer_config
+            .chat_template
+            .or(processor_config.chat_template)
+            .and_then(|t| match t {
+                ChatTemplateVersions::Single(template) => Some(template),
+                ChatTemplateVersions::Multiple(templates) => templates
+                    .into_iter()
+                    .find(|t| t.name == "default")
+                    .map(|t| t.template),
+            })
+            .map(|t| {
+                // .strip() is not supported in minijinja
+                // .capitalize() is not supported in minijinja but we can use | capitalize
+                let t = t
+                    .replace(".strip()", " | trim")
+                    .replace(".capitalize()", " | capitalize");
+                ChatTemplate::new(t, tokenizer_config.bos_token, tokenizer_config.eos_token)
+            });
+
+        // Inference limit with a semaphore
+        let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));
+
+        Self {
+            validation,
+            scheduler,
+            chat_template,
+            limit_concurrent_requests: semaphore,
+        }
+    }
+
+    /// Add a new request to the queue and return a stream of InferStreamResponse
+    #[instrument(skip_all)]
+    pub(crate) async fn generate_stream(
+        &self,
+        request: GenerateRequest,
+    ) -> Result<GenerateStreamResponse, InferError> {
+        // Limit concurrent requests by acquiring a permit from the semaphore
+        let permit = self
+            .clone()
+            .limit_concurrent_requests
+            .try_acquire_owned()
+            .map_err(|err| {
+                metrics::increment_counter!("tgi_request_failure", "err" => "overloaded");
+                tracing::error!("{err}");
+                err
+            })?;
+
+        // Validate request
+        let valid_request = self.validation.validate(request).await.map_err(|err| {
+            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+            tracing::error!("{err}");
+            err
+        })?;
+
+        self.scheduler.schedule(valid_request, permit)
+    }
+
+    /// Tokenizer the input
+    #[instrument(skip_all)]
+    pub(crate) async fn tokenize(
+        &self,
+        request: GenerateRequest,
+    ) -> Result<Option<tokenizers::Encoding>, InferError> {
+        // Tokenize request
+        let inputs = request.inputs;
+        let truncate = request.parameters.truncate;
+        let encoding = self
+            .validation
+            .tokenize(inputs, truncate)
+            .await
+            .map_err(|err| {
+                tracing::error!("Tokenization {err}");
+                err
+            })?;
+
+        // Return Encoding
+        Ok(encoding.map(|(encoding, _)| encoding))
+    }
+
+    /// Apply the chat template to the chat request
+    #[instrument(skip_all)]
+    pub(crate) fn apply_chat_template(
+        &self,
+        messages: Vec<Message>,
+        grammar_with_prompt: Option<(GrammarType, String)>,
+    ) -> Result<String, InferError> {
+        self.chat_template
+            .as_ref()
+            .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
+            .apply(messages, grammar_with_prompt)
+            .map_err(|e| {
+                metrics::increment_counter!("tgi_request_failure", "err" => "template");
+                tracing::error!("{e}");
+                e
+            })
+    }
+
+    /// Add a new request to the queue and return a InferResponse
+    #[instrument(skip_all)]
+    pub(crate) async fn generate(
+        &self,
+        request: GenerateRequest,
+    ) -> Result<InferResponse, InferError> {
+        let use_top_tokens = request.parameters.top_n_tokens.is_some_and(|x| x > 0);
+
+        // Create stream and keep semaphore permit as long as generate lives
+        let (_permit, _input_length, mut stream) = self.generate_stream(request).await?;
+
+        // Return values
+        let mut result_prefill = Vec::new();
+        let mut result_tokens = Vec::new();
+        let mut result_top_tokens = Vec::new();
+        let mut result_generated_text = None;
+        let mut result_start = None;
+        let mut result_queued = None;
+
+        // Iterate on stream
+        while let Some(response) = stream.next().await {
+            match response? {
+                // Add prefill tokens
+                InferStreamResponse::Prefill(prefill_tokens) => {
+                    result_prefill = prefill_tokens;
+                }
+                // Push last token
+                InferStreamResponse::Intermediate { token, top_tokens } => {
+                    result_tokens.push(token);
+                    result_top_tokens.push(top_tokens);
+                }
+                // Final message
+                // Set return values
+                InferStreamResponse::End {
+                    token,
+                    generated_text,
+                    start,
+                    queued,
+                    top_tokens,
+                } => {
+                    result_tokens.push(token);
+                    result_top_tokens.push(top_tokens);
+                    result_generated_text = Some(generated_text);
+                    result_start = Some(start);
+                    result_queued = Some(queued)
+                }
+            }
+        }
+
+        // Check that we received a `InferStreamResponse::End` message
+        if let (Some(generated_text), Some(queued), Some(start)) =
+            (result_generated_text, result_queued, result_start)
+        {
+            Ok(InferResponse {
+                prefill: result_prefill,
+                _input_length,
+                tokens: result_tokens,
+                generated_text,
+                queued,
+                start,
+                top_tokens: if use_top_tokens {
+                    result_top_tokens
+                } else {
+                    Vec::new()
+                },
+            })
+        } else {
+            let err = InferError::IncompleteGeneration;
+            metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
+            tracing::error!("{err}");
+            Err(err)
+        }
+    }
+    /// Add best_of new requests to the queue and return a InferResponse of the sequence with
+    /// the highest log probability per token
+    #[instrument(skip(self, request))]
+    pub(crate) async fn generate_best_of(
+        &self,
+        request: GenerateRequest,
+        best_of: usize,
+    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
+        // validate  best_of parameter separately
+        let best_of = self.validation.validate_best_of(best_of)?;
+
+        // create multiple generate requests
+        let mut infer_responses: Vec<InferResponse> =
+            try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;
+
+        // get the sequence with the highest log probability per token
+        let mut max_index = 0;
+        let mut max_logprob: f32 = f32::MIN;
+
+        for (i, response) in infer_responses.iter().enumerate() {
+            // mean logprobs of the generated tokens
+            let sequence_logprob = response
+                .tokens
+                .iter()
+                .map(|token| token.logprob)
+                .sum::<f32>()
+                / response.tokens.len() as f32;
+
+            // set best sequence
+            if sequence_logprob > max_logprob {
+                max_index = i;
+                max_logprob = sequence_logprob;
+            }
+        }
+        let best_response = infer_responses.remove(max_index);
+        Ok((best_response, infer_responses))
+    }
+}
+
+/// Raise a exception (custom function) used in the chat templates
+fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
+    Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
+}
+
+#[derive(Clone)]
+struct ChatTemplate {
+    template: Template<'static, 'static>,
+    bos_token: Option<String>,
+    eos_token: Option<String>,
+    use_default_tool_template: bool,
+}
+
+impl ChatTemplate {
+    fn new(template: String, bos_token: Option<String>, eos_token: Option<String>) -> Self {
+        let mut env = Box::new(Environment::new());
+        let template_str = template.into_boxed_str();
+        env.add_function("raise_exception", raise_exception);
+
+        // check if contains the tools variable within the template
+        let use_default_tool_template =
+            !template_str.as_ref().replace(' ', "").contains("{{tools}}");
+        // leaking env and template_str as read-only, static resources for performance.
+        let template = Box::leak(env)
+            .template_from_str(Box::leak(template_str))
+            .unwrap();
+
+        Self {
+            template,
+            bos_token,
+            eos_token,
+            use_default_tool_template,
+        }
+    }
+
+    fn apply(
+        &self,
+        mut messages: Vec<Message>,
+        grammar_with_prompt: Option<(GrammarType, String)>,
+    ) -> Result<String, InferError> {
+        if self.use_default_tool_template {
+            if let Some(last_message) = messages.last_mut() {
+                if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt {
+                    last_message.content.push(MessageChunk::Text(Text {
+                        text: format!("\n---\n{}\n{}", tool_prompt, tools),
+                    }));
+                }
+            }
+        }
+
+        let messages: Vec<TextMessage> = messages.into_iter().map(|c| c.into()).collect();
+
+        self.template
+            .render(ChatTemplateInputs {
+                messages,
+                bos_token: self.bos_token.as_deref(),
+                eos_token: self.eos_token.as_deref(),
+                add_generation_prompt: true,
+                tools: None,
+                tools_prompt: None,
+            })
+            .map_err(InferError::TemplateError)
+    }
+}
+
+pub struct ToolGrammar {}
+
+impl ToolGrammar {
+    pub fn apply(
+        tools: Option<Vec<Tool>>,
+        tool_choice: Option<ToolType>,
+    ) -> Result<Option<Tools>, InferError> {
+        if let Some((req_tools, tool_choice)) = tools.zip(tool_choice) {
+            // let tool_prompt = tool_prompt.unwrap_or_default();
+            let tools_to_use = match tool_choice {
+                ToolType::FunctionName(name) => {
+                    vec![req_tools
+                        .iter()
+                        .find(|tool| tool.function.name == *name)
+                        .unwrap_or_else(|| panic!("Tool with name {} not found", name))
+                        .clone()]
+                }
+                ToolType::OneOf => req_tools.to_owned(),
+            };
+
+            // adds the error notification function for LLM feedback if required
+            let mut text_response_properties = Map::new();
+            text_response_properties.insert(
+                "error".to_string(),
+                serde_json::json!({
+                    "type": "string",
+                    "description": "The error or issue to notify"
+                }),
+            );
+            text_response_properties.insert(
+                "_name".to_string(),
+                serde_json::json!({
+                    "type": "string",
+                    "const": "notify_error"
+                }),
+            );
+
+            let functions: HashMap<String, serde_json::Value> = tools_to_use
+                .iter()
+                .map(|tool| {
+                    let func = tool.function.clone();
+
+                    // Clone the existing parameters, which are expected to be a JSON object
+                    let mut params = if let Value::Object(params) = &func.arguments {
+                        params.clone()
+                    } else {
+                        Map::new()
+                    };
+
+                    // Insert the function's description at the top level, outside of properties
+                    params.insert(
+                        "description".to_string(),
+                        Value::String(func.description.clone().unwrap_or_default()),
+                    );
+
+                    // Ensure 'properties' exists and is an object
+                    let properties = params
+                        .entry("properties".to_string())
+                        .or_insert_with(|| json!({}))
+                        .as_object_mut()
+                        .unwrap();
+
+                    // Insert the constant for the function name inside 'properties'
+                    properties.insert(
+                        "_name".to_string(),
+                        json!({
+                            "type": "string",
+                            "const": func.name.clone(),
+                            // "description": "The name of the function"
+                        }),
+                    );
+
+                    // Check if 'required' exists, and it is an array. If not, create an empty array.
+                    let required = params
+                        .entry("required".to_string())
+                        .or_insert_with(|| json!([]))
+                        .as_array_mut()
+                        .unwrap();
+
+                    // Add 'name' to the 'required' array if it is not already present
+                    if !required.iter().any(|r| r == "_name") {
+                        required.push(json!("_name"));
+                    }
+
+                    (func.name, Value::Object(params))
+                })
+                .chain([(
+                    "notify_error".to_string(),
+                    serde_json::json!({
+                        "properties": text_response_properties,
+                        "required": ["error", "_name"],
+                        "type": "object"
+                    }),
+                )])
+                .collect();
+
+            let tools = Tools {
+                functions_map: FunctionsMap { functions },
+                properties: Properties {
+                    function: tools_to_use
+                        .iter()
+                        .map(|tool| FunctionRef {
+                            ref_path: format!("#/$functions/{}", tool.function.name.clone()),
+                        })
+                        .chain(std::iter::once(FunctionRef {
+                            ref_path: "#/$functions/notify_error".to_string(),
+                        }))
+                        .collect(),
+                },
+            };
+
+            return Ok(Some(tools));
+        }
+        // Err(InferError::ToolError("No tools provided".to_string()))
+        Ok(None)
+    }
+}
+
+/// Type alias for generation responses
+pub(crate) type GenerateStreamResponse = (
+    OwnedSemaphorePermit,
+    u32, // input_length
+    UnboundedReceiverStream<Result<InferStreamResponse, InferError>>,
+);
+
+#[derive(Debug)]
+pub(crate) struct GeneratedText {
+    pub(crate) text: String,
+    pub(crate) generated_tokens: u32,
+    pub(crate) finish_reason: FinishReason,
+    pub(crate) seed: Option<u64>,
+}
+
+#[derive(Debug)]
+pub(crate) enum InferStreamResponse {
+    // Optional first message
+    Prefill(Vec<PrefillToken>),
+    // Intermediate messages
+    Intermediate {
+        token: Token,
+        top_tokens: Vec<Token>,
+    },
+    // Last message
+    End {
+        token: Token,
+        top_tokens: Vec<Token>,
+        generated_text: GeneratedText,
+        start: Instant,
+        queued: Instant,
+    },
+}
+
+#[derive(Debug)]
+pub(crate) struct InferResponse {
+    /// input_length is the input as perceived by the rust tokenizer in the
+    /// validation pathway. It is redundant with prefill.len() but prefill
+    /// has data only if the user asked for it. This will always be filled.
+    pub(crate) _input_length: u32,
+    pub(crate) prefill: Vec<PrefillToken>,
+    pub(crate) tokens: Vec<Token>,
+    pub(crate) generated_text: GeneratedText,
+    pub(crate) queued: Instant,
+    pub(crate) start: Instant,
+    pub(crate) top_tokens: Vec<Vec<Token>>,
+}
+
+#[derive(Debug, Error)]
+pub enum InferError {
+    #[error("Request failed during generation: {0}")]
+    GenerationError(String),
+    #[error("Model is overloaded")]
+    Overloaded(#[from] TryAcquireError),
+    #[error("Input validation error: {0}")]
+    ValidationError(#[from] ValidationError),
+    #[error("Incomplete generation")]
+    IncompleteGeneration,
+    #[error("Template error: {0}")]
+    TemplateError(#[from] minijinja::Error),
+    #[error("Tool error: {0}")]
+    ToolError(String),
+}
+
+impl InferError {
+    pub(crate) fn error_type(&self) -> &str {
+        match self {
+            InferError::GenerationError(_) => "generation",
+            InferError::Overloaded(_) => "overloaded",
+            InferError::ValidationError(_) => "validation",
+            InferError::IncompleteGeneration => "incomplete_generation",
+            InferError::TemplateError(_) => "template_error",
+            InferError::ToolError(_) => "tool_error",
+        }
+    }
+}
--- a/router/src/infer/v2/mod.rs
+++ b/router/src/infer/v2/mod.rs
@ -0,0 +1,4 @@
+mod queue;
+mod scheduler;
+
+pub(crate) use scheduler::SchedulerV2;
--- a/router/src/infer/v2/queue.rs
+++ b/router/src/infer/v2/queue.rs
@ -0,0 +1,667 @@
+use crate::infer::{InferError, InferStreamResponse};
+use crate::validation::{
+    ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
+};
+use nohash_hasher::{BuildNoHashHasher, IntMap};
+use std::cmp::min;
+use std::collections::VecDeque;
+use text_generation_client::v2::{
+    Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
+use text_generation_client::ChunksToString;
+use tokio::sync::{mpsc, oneshot};
+use tokio::time::Instant;
+use tracing::{info_span, instrument, Span};
+
+/// Queue entry
+#[derive(Debug)]
+pub(crate) struct Entry {
+    /// Request
+    pub request: ValidGenerateRequest,
+    /// Response sender to communicate between the Infer struct and the batching_task
+    pub response_tx: mpsc::UnboundedSender<Result<InferStreamResponse, InferError>>,
+    /// Span that will live as long as entry
+    pub span: Span,
+    /// Temporary span used as a guard when logging inference, wait times...
+    pub temp_span: Option<Span>,
+    /// Instant when this entry was queued
+    pub queue_time: Instant,
+    /// Instant when this entry was added to a batch
+    pub batch_time: Option<Instant>,
+}
+
+/// Request Queue
+#[derive(Debug, Clone)]
+pub(crate) struct Queue {
+    /// Channel to communicate with the background queue task
+    queue_sender: mpsc::UnboundedSender<QueueCommand>,
+}
+
+impl Queue {
+    pub(crate) fn new(
+        requires_padding: bool,
+        block_size: u32,
+        window_size: Option<u32>,
+        speculate: u32,
+    ) -> Self {
+        // Create channel
+        let (queue_sender, queue_receiver) = mpsc::unbounded_channel();
+
+        // Launch background queue task
+        tokio::spawn(queue_task(
+            requires_padding,
+            block_size,
+            window_size,
+            speculate,
+            queue_receiver,
+        ));
+
+        Self { queue_sender }
+    }
+
+    #[instrument(skip_all)]
+    pub(crate) fn append(&self, entry: Entry) {
+        // Send append command to the background task managing the state
+        // Unwrap is safe here
+        self.queue_sender
+            .send(QueueCommand::Append(Box::new(entry), Span::current()))
+            .unwrap();
+    }
+
+    // Get the next batch
+    #[instrument(skip(self))]
+    pub(crate) async fn next_batch(
+        &self,
+        min_size: Option<usize>,
+        max_size: Option<usize>,
+        prefill_token_budget: u32,
+        token_budget: u32,
+    ) -> Option<NextBatch> {
+        // Create response channel
+        let (response_sender, response_receiver) = oneshot::channel();
+        // Send next batch command to the background task managing the state
+        // Unwrap is safe here
+        self.queue_sender
+            .send(QueueCommand::NextBatch {
+                min_size,
+                max_size,
+                prefill_token_budget,
+                token_budget,
+                response_sender,
+                span: Span::current(),
+            })
+            .unwrap();
+        // Await on response channel
+        // Unwrap is safe here
+        response_receiver.await.unwrap()
+    }
+}
+
+// Background task responsible of the queue state
+async fn queue_task(
+    requires_padding: bool,
+    block_size: u32,
+    window_size: Option<u32>,
+    speculate: u32,
+    mut receiver: mpsc::UnboundedReceiver<QueueCommand>,
+) {
+    let mut state = State::new(requires_padding, block_size, window_size, speculate);
+
+    while let Some(cmd) = receiver.recv().await {
+        match cmd {
+            QueueCommand::Append(entry, span) => {
+                span.in_scope(|| state.append(*entry));
+                metrics::increment_gauge!("tgi_queue_size", 1.0);
+            }
+            QueueCommand::NextBatch {
+                min_size,
+                max_size,
+                prefill_token_budget,
+                token_budget,
+                response_sender,
+                span,
+            } => span.in_scope(|| {
+                let next_batch =
+                    state.next_batch(min_size, max_size, prefill_token_budget, token_budget);
+                response_sender.send(next_batch).unwrap();
+                metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
+            }),
+        }
+    }
+}
+
+/// Queue State
+#[derive(Debug)]
+struct State {
+    /// Queue entries organized in a Vec
+    entries: VecDeque<(u64, Entry)>,
+
+    /// Id of the next entry
+    next_id: u64,
+
+    /// Id of the next batch
+    next_batch_id: u64,
+
+    /// Whether the model is using padding
+    requires_padding: bool,
+
+    /// Paged Attention block size
+    block_size: u32,
+
+    /// Sliding window
+    window_size: Option<u32>,
+
+    /// Speculation amount
+    speculate: u32,
+}
+
+impl State {
+    fn new(
+        requires_padding: bool,
+        block_size: u32,
+        window_size: Option<u32>,
+        speculate: u32,
+    ) -> Self {
+        Self {
+            entries: VecDeque::with_capacity(128),
+            next_id: 0,
+            next_batch_id: 0,
+            requires_padding,
+            block_size,
+            window_size,
+            speculate,
+        }
+    }
+
+    /// Append an entry to the queue
+    fn append(&mut self, mut entry: Entry) {
+        // Create a span that will live as long as the entry is in the queue waiting to be batched
+        let queue_span = info_span!(parent: &entry.span, "queued");
+        entry.temp_span = Some(queue_span);
+
+        // Push entry in the queue
+        self.entries.push_back((self.next_id, entry));
+        self.next_id += 1;
+    }
+
+    // Get the next batch
+    fn next_batch(
+        &mut self,
+        min_size: Option<usize>,
+        max_size: Option<usize>,
+        prefill_token_budget: u32,
+        token_budget: u32,
+    ) -> Option<NextBatch> {
+        if self.entries.is_empty() {
+            tracing::debug!("No queue");
+            return None;
+        }
+
+        // Check if we have enough entries
+        if let Some(min_size) = min_size {
+            if self.entries.len() < min_size {
+                tracing::debug!("Not enough entries");
+                return None;
+            }
+        }
+
+        // Pad prefill_token_budget to be a multiple of block size
+        let prefill_token_budget =
+            ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size;
+
+        // Create span for this batch to add context to inference calls
+        let next_batch_span = info_span!(parent: None, "batch", batch_size = tracing::field::Empty);
+        next_batch_span.follows_from(&Span::current());
+
+        let mut batch_requests = Vec::with_capacity(self.entries.len());
+        let mut batch_entries =
+            IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default());
+
+        let mut max_input_length = 0;
+        let mut prefill_tokens: u32 = 0;
+        let mut decode_tokens: u32 = 0;
+
+        // Pop entries starting from the front of the queue
+        while let Some((id, mut entry)) = self.entries.pop_front() {
+            // Filter entries where the response receiver was dropped (== entries where the request
+            // was dropped by the client)
+            if entry.response_tx.is_closed() {
+                metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+                tracing::debug!("Dropping entry");
+                continue;
+            }
+
+            if self.requires_padding {
+                // We pad to max input length in the Python shards
+                // We need to take these padding tokens into the equation
+                max_input_length = max_input_length.max(entry.request.input_length);
+                prefill_tokens = (batch_requests.len() + 1) as u32 * max_input_length
+            } else {
+                // pad to block size
+                prefill_tokens += ((entry.request.input_length + self.block_size - 1)
+                    / self.block_size)
+                    * self.block_size;
+            }
+
+            if self.requires_padding {
+                decode_tokens += entry.request.stopping_parameters.max_new_tokens;
+            } else {
+                let max_new_tokens = match self.window_size {
+                    None => entry.request.stopping_parameters.max_new_tokens,
+                    Some(window_size) => min(
+                        window_size.saturating_sub(entry.request.input_length),
+                        entry.request.stopping_parameters.max_new_tokens,
+                    ),
+                };
+
+                // pad to block size
+                decode_tokens +=
+                    ((max_new_tokens + self.block_size - 1) / self.block_size) * self.block_size;
+            }
+
+            if prefill_tokens > prefill_token_budget
+                || (prefill_tokens + decode_tokens + self.speculate) > token_budget
+            {
+                // Entry is over budget
+                // Add it back to the front
+                tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > {prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate);
+                self.entries.push_front((id, entry));
+                break;
+            }
+
+            tracing::debug!("Accepting entry");
+            // Create a new span to link the batch back to this entry
+            let entry_batch_span = info_span!(parent: &entry.span, "infer");
+            // Add relationships
+            next_batch_span.follows_from(&entry_batch_span);
+            entry_batch_span.follows_from(&next_batch_span);
+            // Update entry
+            entry.temp_span = Some(entry_batch_span);
+
+            batch_requests.push(Request {
+                id,
+                prefill_logprobs: entry.request.decoder_input_details,
+                inputs: entry.request.inputs.chunks_to_string(),
+                truncate: entry.request.truncate,
+                parameters: Some(NextTokenChooserParameters::from(
+                    entry.request.parameters.clone(),
+                )),
+                stopping_parameters: Some(StoppingCriteriaParameters::from(
+                    entry.request.stopping_parameters.clone(),
+                )),
+                top_n_tokens: entry.request.top_n_tokens,
+            });
+            // Set batch_time
+            entry.batch_time = Some(Instant::now());
+            // Insert in batch_entries IntMap
+            batch_entries.insert(id, entry);
+
+            // Check if max_size
+            if Some(batch_requests.len()) == max_size {
+                break;
+            }
+        }
+
+        // Empty batch
+        if batch_requests.is_empty() {
+            tracing::debug!("Filtered out all entries");
+            return None;
+        }
+
+        // Check if our batch is big enough
+        if let Some(min_size) = min_size {
+            // Batch is too small
+            if batch_requests.len() < min_size {
+                // Add back entries to the queue in the correct order
+                for r in batch_requests.into_iter().rev() {
+                    let id = r.id;
+                    let entry = batch_entries.remove(&id).unwrap();
+                    self.entries.push_front((id, entry));
+                }
+
+                return None;
+            }
+        }
+
+        // Final batch size
+        let size = batch_requests.len() as u32;
+        next_batch_span.record("batch_size", size);
+
+        let batch = Batch {
+            id: self.next_batch_id,
+            requests: batch_requests,
+            size,
+            max_tokens: (prefill_tokens + decode_tokens),
+        };
+        // Increment batch id
+        self.next_batch_id += 1;
+
+        metrics::histogram!("tgi_batch_next_size", batch.size as f64);
+
+        Some((batch_entries, batch, next_batch_span))
+    }
+}
+
+type NextBatch = (IntMap<u64, Entry>, Batch, Span);
+
+#[derive(Debug)]
+enum QueueCommand {
+    Append(Box<Entry>, Span),
+    NextBatch {
+        min_size: Option<usize>,
+        max_size: Option<usize>,
+        prefill_token_budget: u32,
+        token_budget: u32,
+        response_sender: oneshot::Sender<Option<NextBatch>>,
+        span: Span,
+    },
+}
+
+impl From<ValidParameters> for NextTokenChooserParameters {
+    fn from(value: ValidParameters) -> Self {
+        let (grammar, grammar_type) = match value.grammar {
+            None => (String::new(), GrammarType::None),
+
+            Some(grammar) => match grammar {
+                ValidGrammar::Json(grammar_string) => (grammar_string, GrammarType::Json),
+                ValidGrammar::Regex(grammar_string) => (grammar_string, GrammarType::Regex),
+            },
+        };
+
+        Self {
+            temperature: value.temperature,
+            top_k: value.top_k,
+            top_p: value.top_p,
+            typical_p: value.typical_p,
+            do_sample: value.do_sample,
+            seed: value.seed,
+            repetition_penalty: value.repetition_penalty,
+            frequency_penalty: value.frequency_penalty,
+            watermark: value.watermark,
+            grammar,
+            grammar_type: grammar_type.into(),
+        }
+    }
+}
+
+impl From<ValidStoppingParameters> for StoppingCriteriaParameters {
+    fn from(value: ValidStoppingParameters) -> Self {
+        Self {
+            max_new_tokens: value.max_new_tokens,
+            stop_sequences: value.stop_sequences,
+            ignore_eos_token: value.ignore_eos_token,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tracing::info_span;
+
+    fn default_entry() -> (
+        Entry,
+        mpsc::UnboundedReceiver<Result<InferStreamResponse, InferError>>,
+    ) {
+        let (response_tx, receiver_tx) = mpsc::unbounded_channel();
+
+        let entry = Entry {
+            request: ValidGenerateRequest {
+                inputs: vec![],
+                input_length: 0,
+                truncate: 0,
+                decoder_input_details: false,
+                parameters: ValidParameters {
+                    temperature: 0.0,
+                    top_k: 0,
+                    top_p: 0.0,
+                    typical_p: 0.0,
+                    do_sample: false,
+                    seed: 0,
+                    repetition_penalty: 0.0,
+                    frequency_penalty: 0.0,
+                    watermark: false,
+                    grammar: None,
+                },
+                stopping_parameters: ValidStoppingParameters {
+                    ignore_eos_token: false,
+                    max_new_tokens: 1,
+                    stop_sequences: vec![],
+                },
+                top_n_tokens: 0,
+            },
+            response_tx,
+            span: info_span!("entry"),
+            temp_span: None,
+            queue_time: Instant::now(),
+            batch_time: None,
+        };
+        (entry, receiver_tx)
+    }
+
+    #[test]
+    fn test_append() {
+        let mut state = State::new(false, 1, None, 0);
+        let (entry, _guard) = default_entry();
+
+        assert_eq!(state.next_id, 0);
+        assert_eq!(state.entries.len(), 0);
+
+        state.append(entry);
+
+        assert_eq!(state.next_id, 1);
+        assert_eq!(state.entries.len(), 1);
+        let (id, _) = state.entries.remove(0).unwrap();
+        assert_eq!(id, 0);
+    }
+
+    #[test]
+    fn test_next_batch_empty() {
+        let mut state = State::new(false, 1, None, 0);
+
+        assert!(state.next_batch(None, None, 1, 1).is_none());
+        assert!(state.next_batch(Some(1), None, 1, 1).is_none());
+    }
+
+    #[test]
+    fn test_next_batch_min_size() {
+        let mut state = State::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        state.append(entry1);
+        state.append(entry2);
+
+        let (entries, batch, _) = state.next_batch(None, None, 2, 2).unwrap();
+        assert_eq!(entries.len(), 2);
+        assert!(entries.contains_key(&0));
+        assert!(entries.contains_key(&1));
+        assert!(entries.get(&0).unwrap().batch_time.is_some());
+        assert!(entries.get(&1).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 2);
+
+        assert_eq!(state.next_id, 2);
+        assert_eq!(state.entries.len(), 0);
+        assert_eq!(state.next_batch_id, 1);
+
+        let (entry3, _guard3) = default_entry();
+        state.append(entry3);
+
+        assert!(state.next_batch(Some(2), None, 2, 2).is_none());
+
+        assert_eq!(state.next_id, 3);
+        assert_eq!(state.entries.len(), 1);
+        let (id, _) = state.entries.remove(0).unwrap();
+        assert_eq!(id, 2);
+    }
+
+    #[test]
+    fn test_next_batch_max_size() {
+        let mut state = State::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        state.append(entry1);
+        state.append(entry2);
+
+        let (entries, batch, _) = state.next_batch(None, Some(1), 2, 2).unwrap();
+        assert_eq!(entries.len(), 1);
+        assert!(entries.contains_key(&0));
+        assert!(entries.get(&0).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+
+        assert_eq!(state.next_id, 2);
+        assert_eq!(state.entries.len(), 1);
+        assert_eq!(state.next_batch_id, 1);
+    }
+
+    #[test]
+    fn test_next_batch_token_budget() {
+        let mut state = State::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        state.append(entry1);
+        state.append(entry2);
+
+        let (entries, batch, _) = state.next_batch(None, None, 1, 1).unwrap();
+        assert_eq!(entries.len(), 1);
+        assert!(entries.contains_key(&0));
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+
+        assert_eq!(state.next_id, 2);
+        assert_eq!(state.entries.len(), 1);
+        assert_eq!(state.next_batch_id, 1);
+
+        let (entry3, _guard3) = default_entry();
+        state.append(entry3);
+
+        let (entries, batch, _) = state.next_batch(None, None, 3, 3).unwrap();
+        assert_eq!(entries.len(), 2);
+        assert!(entries.contains_key(&1));
+        assert!(entries.contains_key(&2));
+        assert_eq!(batch.id, 1);
+        assert_eq!(batch.size, 2);
+
+        assert_eq!(state.next_id, 3);
+        assert_eq!(state.entries.len(), 0);
+        assert_eq!(state.next_batch_id, 2);
+    }
+
+    #[tokio::test]
+    async fn test_queue_append() {
+        let queue = Queue::new(false, 1, None, 0);
+        let (entry, _guard) = default_entry();
+        queue.append(entry);
+    }
+
+    #[tokio::test]
+    async fn test_queue_next_batch_empty() {
+        let queue = Queue::new(false, 1, None, 0);
+
+        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
+        assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_queue_next_batch_min_size() {
+        let queue = Queue::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        queue.append(entry1);
+        queue.append(entry2);
+
+        let (entries, batch, _) = queue.next_batch(None, None, 2, 2).await.unwrap();
+        assert_eq!(entries.len(), 2);
+        assert!(entries.contains_key(&0));
+        assert!(entries.contains_key(&1));
+        assert!(entries.get(&0).unwrap().batch_time.is_some());
+        assert!(entries.get(&1).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 2);
+
+        let (entry3, _guard3) = default_entry();
+        queue.append(entry3);
+
+        // Not enough requests pending
+        assert!(queue.next_batch(Some(2), None, 2, 2).await.is_none());
+        // Not enough token budget
+        assert!(queue.next_batch(Some(1), None, 0, 0).await.is_none());
+        // Ok
+        let (entries2, batch2, _) = queue.next_batch(Some(1), None, 2, 2).await.unwrap();
+        assert_eq!(entries2.len(), 1);
+        assert!(entries2.contains_key(&2));
+        assert!(entries2.get(&2).unwrap().batch_time.is_some());
+        assert_eq!(batch2.id, 1);
+        assert_eq!(batch2.size, 1);
+    }
+
+    #[tokio::test]
+    async fn test_queue_next_batch_max_size() {
+        let queue = Queue::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        queue.append(entry1);
+        queue.append(entry2);
+
+        let (entries, batch, _) = queue.next_batch(None, Some(1), 2, 2).await.unwrap();
+        assert_eq!(entries.len(), 1);
+        assert!(entries.contains_key(&0));
+        assert!(entries.get(&0).unwrap().batch_time.is_some());
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+    }
+
+    #[tokio::test]
+    async fn test_queue_next_batch_token_budget() {
+        let queue = Queue::new(false, 1, None, 0);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        queue.append(entry1);
+        queue.append(entry2);
+
+        let (entries, batch, _) = queue.next_batch(None, None, 1, 1).await.unwrap();
+        assert_eq!(entries.len(), 1);
+        assert!(entries.contains_key(&0));
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 1);
+
+        let (entry3, _guard3) = default_entry();
+        queue.append(entry3);
+
+        let (entries, batch, _) = queue.next_batch(None, None, 3, 3).await.unwrap();
+        assert_eq!(entries.len(), 2);
+        assert!(entries.contains_key(&1));
+        assert!(entries.contains_key(&2));
+        assert_eq!(batch.id, 1);
+        assert_eq!(batch.size, 2);
+    }
+
+    #[tokio::test]
+    async fn test_queue_next_batch_token_speculate() {
+        let queue = Queue::new(false, 1, None, 2);
+        let (entry1, _guard1) = default_entry();
+        let (entry2, _guard2) = default_entry();
+        queue.append(entry1);
+        queue.append(entry2);
+
+        // Budget of 1 is not enough
+        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
+
+        let (entries, batch, _) = queue.next_batch(None, None, 6, 6).await.unwrap();
+        assert_eq!(entries.len(), 2);
+        assert!(entries.contains_key(&0));
+        assert!(entries.contains_key(&1));
+        assert_eq!(batch.id, 0);
+        assert_eq!(batch.size, 2);
+    }
+
+    #[tokio::test]
+    async fn test_queue_next_batch_dropped_receiver() {
+        let queue = Queue::new(false, 1, None, 0);
+        let (entry, _) = default_entry();
+        queue.append(entry);
+
+        assert!(queue.next_batch(None, None, 1, 1).await.is_none());
+    }
+}
--- a/router/src/infer/v2/scheduler.rs
+++ b/router/src/infer/v2/scheduler.rs
@ -1,79 +1,46 @@
 /// Batching and inference logic
-use crate::validation::{Validation, ValidationError};
-use crate::{
-    ChatTemplateInputs, ChatTemplateVersions, Entry, GenerateRequest, GenerateStreamResponse,
-    HubProcessorConfig, HubTokenizerConfig, Message, MessageChunk, PrefillToken, Queue, Text,
-    TextMessage, Token,
+use crate::infer::v2::queue::{Entry, Queue};
+use crate::infer::{
+    GenerateStreamResponse, GeneratedText, InferError, InferStreamResponse, Scheduler,
 };
-use crate::{FunctionRef, FunctionsMap, GrammarType, Properties, Tool, ToolType, Tools};
-use futures::future::try_join_all;
-use minijinja::{Environment, ErrorKind, Template};
+use crate::validation::ValidGenerateRequest;
+use crate::{FinishReason, PrefillToken, Token};
 use nohash_hasher::IntMap;
-use serde_json::{json, Map, Value};
-use std::collections::HashMap;
 use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
 };
-use text_generation_client::{
-    Batch, CachedBatch, ClientError, GeneratedText, Generation, ShardedClient, Tokens,
-};
-use thiserror::Error;
+use text_generation_client::v2::{Batch, CachedBatch, Generation, ShardedClient};
+use text_generation_client::ClientError;
 use tokio::sync::mpsc::error::SendError;
-use tokio::sync::{mpsc, Notify, Semaphore, TryAcquireError};
+use tokio::sync::{mpsc, Notify, OwnedSemaphorePermit};
 use tokio::time::Instant;
 use tokio_stream::wrappers::UnboundedReceiverStream;
-use tokio_stream::StreamExt;
 use tracing::{info_span, instrument, Instrument, Span};

-/// Inference struct
-#[derive(Clone)]
-pub struct Infer {
-    /// Validation
-    validation: Validation,
+pub(crate) struct SchedulerV2 {
    /// Request queue
    queue: Queue,
-    /// Shared state
-    shared: Arc<Shared>,
-    /// Chat template
-    chat_template: Option<ChatTemplate>,
-    /// Inference limit
-    limit_concurrent_requests: Arc<Semaphore>,
+    /// Notify batcher on queue appends
+    batching_task_notifier: Arc<Notify>,
 }

-/// Infer shared state
-struct Shared {
-    /// Batching background Tokio task notifier
-    batching_task: Notify,
-}
-
-/// Raise a exception (custom function) used in the chat templates
-fn raise_exception(err_text: String) -> Result<String, minijinja::Error> {
-    Err(minijinja::Error::new(ErrorKind::SyntaxError, err_text))
-}
-
-impl Infer {
+impl SchedulerV2 {
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        client: ShardedClient,
-        validation: Validation,
        waiting_served_ratio: f32,
        max_batch_prefill_tokens: u32,
        max_batch_total_tokens: u32,
        max_waiting_tokens: usize,
        max_batch_size: Option<usize>,
-        max_concurrent_requests: usize,
        requires_padding: bool,
        window_size: Option<u32>,
        speculate: u32,
        generation_health: Arc<AtomicBool>,
-        tokenizer_config: HubTokenizerConfig,
-        processor_config: HubProcessorConfig,
    ) -> Self {
        let queue = Queue::new(requires_padding, 16, window_size, speculate);
-        let shared = Arc::new(Shared {
-            batching_task: Notify::new(),
-        });
+        let batching_task_notifier = Arc::new(Notify::new());

        // Spawn batching background task that contains all the inference logic
        tokio::spawn(batching_task(
@ -84,72 +51,31 @@ impl Infer {
            max_waiting_tokens,
            max_batch_size,
            queue.clone(),
-            shared.clone(),
+            batching_task_notifier.clone(),
            generation_health,
        ));

-        let chat_template = tokenizer_config
-            .chat_template
-            .or(processor_config.chat_template)
-            .and_then(|t| match t {
-                ChatTemplateVersions::Single(template) => Some(template),
-                ChatTemplateVersions::Multiple(templates) => templates
-                    .into_iter()
-                    .find(|t| t.name == "default")
-                    .map(|t| t.template),
-            })
-            .map(|t| {
-                // .strip() is not supported in minijinja
-                // .capitalize() is not supported in minijinja but we can use | capitalize
-                let t = t
-                    .replace(".strip()", " | trim")
-                    .replace(".capitalize()", " | capitalize");
-                ChatTemplate::new(t, tokenizer_config.bos_token, tokenizer_config.eos_token)
-            });
-
-        // Inference limit with a semaphore
-        let semaphore = Arc::new(Semaphore::new(max_concurrent_requests));
-
        Self {
-            validation,
            queue,
-            shared,
-            chat_template,
-            limit_concurrent_requests: semaphore,
+            batching_task_notifier,
+        }
    }
 }

-    /// Add a new request to the queue and return a stream of InferStreamResponse
+impl Scheduler for SchedulerV2 {
    #[instrument(skip_all)]
-    pub(crate) async fn generate_stream(
+    fn schedule(
        &self,
-        request: GenerateRequest,
+        request: ValidGenerateRequest,
+        permit: OwnedSemaphorePermit,
    ) -> Result<GenerateStreamResponse, InferError> {
-        // Limit concurrent requests by acquiring a permit from the semaphore
-        let permit = self
-            .clone()
-            .limit_concurrent_requests
-            .try_acquire_owned()
-            .map_err(|err| {
-                metrics::increment_counter!("tgi_request_failure", "err" => "overloaded");
-                tracing::error!("{err}");
-                err
-            })?;
-
-        // Validate request
-        let valid_request = self.validation.validate(request).await.map_err(|err| {
-            metrics::increment_counter!("tgi_request_failure", "err" => "validation");
-            tracing::error!("{err}");
-            err
-        })?;
-
        // MPSC channel to communicate with the background batching task
        let (response_tx, response_rx) = mpsc::unbounded_channel();
-        let input_length = valid_request.input_length;
+        let input_length = request.input_length;

        // Append the request to the queue
        self.queue.append(Entry {
-            request: valid_request,
+            request,
            response_tx,
            span: Span::current(),
            temp_span: None,
@ -159,7 +85,7 @@ impl Infer {

        // Notify the background task that we have a new entry in the queue that needs
        // to be batched
-        self.shared.batching_task.notify_one();
+        self.batching_task_notifier.notify_one();

        // Return stream
        Ok((
@ -168,343 +94,6 @@ impl Infer {
            UnboundedReceiverStream::new(response_rx),
        ))
    }
-
-    /// Tokenizer the input
-    #[instrument(skip_all)]
-    pub(crate) async fn tokenize(
-        &self,
-        request: GenerateRequest,
-    ) -> Result<Option<tokenizers::Encoding>, InferError> {
-        // Tokenize request
-        let inputs = request.inputs;
-        let truncate = request.parameters.truncate;
-        let encoding = self
-            .validation
-            .tokenize(inputs, truncate)
-            .await
-            .map_err(|err| {
-                tracing::error!("Tokenization {err}");
-                err
-            })?;
-
-        // Return Encoding
-        Ok(encoding.map(|(encoding, _)| encoding))
-    }
-
-    /// Apply the chat template to the chat request
-    #[instrument(skip_all)]
-    pub(crate) fn apply_chat_template(
-        &self,
-        messages: Vec<Message>,
-        grammar_with_prompt: Option<(GrammarType, String)>,
-    ) -> Result<String, InferError> {
-        self.chat_template
-            .as_ref()
-            .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
-            .apply(messages, grammar_with_prompt)
-            .map_err(|e| {
-                metrics::increment_counter!("tgi_request_failure", "err" => "template");
-                tracing::error!("{e}");
-                e
-            })
-    }
-
-    /// Add a new request to the queue and return a InferResponse
-    #[instrument(skip_all)]
-    pub(crate) async fn generate(
-        &self,
-        request: GenerateRequest,
-    ) -> Result<InferResponse, InferError> {
-        let use_top_tokens = request.parameters.top_n_tokens.is_some_and(|x| x > 0);
-
-        // Create stream and keep semaphore permit as long as generate lives
-        let (_permit, _input_length, mut stream) = self.generate_stream(request).await?;
-
-        // Return values
-        let mut result_prefill = Vec::new();
-        let mut result_tokens = Vec::new();
-        let mut result_top_tokens = Vec::new();
-        let mut result_generated_text = None;
-        let mut result_start = None;
-        let mut result_queued = None;
-
-        // Iterate on stream
-        while let Some(response) = stream.next().await {
-            match response? {
-                // Add prefill tokens
-                InferStreamResponse::Prefill(tokens) => {
-                    // Create Token objects
-                    // We do that here instead of in the Python code as Rust for loops are faster
-                    result_prefill = tokens
-                        .ids
-                        .into_iter()
-                        .zip(tokens.logprobs.into_iter())
-                        .zip(tokens.texts.into_iter())
-                        .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
-                        .collect();
-                }
-                // Push last token
-                InferStreamResponse::Intermediate { token, top_tokens } => {
-                    result_tokens.push(token);
-                    result_top_tokens.push(top_tokens);
-                }
-                // Final message
-                // Set return values
-                InferStreamResponse::End {
-                    token,
-                    generated_text,
-                    start,
-                    queued,
-                    top_tokens,
-                } => {
-                    result_tokens.push(token);
-                    result_top_tokens.push(top_tokens);
-                    result_generated_text = Some(generated_text);
-                    result_start = Some(start);
-                    result_queued = Some(queued)
-                }
-            }
-        }
-
-        // Check that we received a `InferStreamResponse::End` message
-        if let (Some(generated_text), Some(queued), Some(start)) =
-            (result_generated_text, result_queued, result_start)
-        {
-            Ok(InferResponse {
-                prefill: result_prefill,
-                _input_length,
-                tokens: result_tokens,
-                generated_text,
-                queued,
-                start,
-                top_tokens: if use_top_tokens {
-                    result_top_tokens
-                } else {
-                    Vec::new()
-                },
-            })
-        } else {
-            let err = InferError::IncompleteGeneration;
-            metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
-            tracing::error!("{err}");
-            Err(err)
-        }
-    }
-    /// Add best_of new requests to the queue and return a InferResponse of the sequence with
-    /// the highest log probability per token
-    #[instrument(skip(self, request))]
-    pub(crate) async fn generate_best_of(
-        &self,
-        request: GenerateRequest,
-        best_of: usize,
-    ) -> Result<(InferResponse, Vec<InferResponse>), InferError> {
-        // validate  best_of parameter separately
-        let best_of = self.validation.validate_best_of(best_of)?;
-
-        // create multiple generate requests
-        let mut infer_responses: Vec<InferResponse> =
-            try_join_all((0..best_of).map(|_| self.generate(request.clone()))).await?;
-
-        // get the sequence with the highest log probability per token
-        let mut max_index = 0;
-        let mut max_logprob: f32 = f32::MIN;
-
-        for (i, response) in infer_responses.iter().enumerate() {
-            // mean logprobs of the generated tokens
-            let sequence_logprob = response
-                .tokens
-                .iter()
-                .map(|token| token.logprob)
-                .sum::<f32>()
-                / response.tokens.len() as f32;
-
-            // set best sequence
-            if sequence_logprob > max_logprob {
-                max_index = i;
-                max_logprob = sequence_logprob;
-            }
-        }
-        let best_response = infer_responses.remove(max_index);
-        Ok((best_response, infer_responses))
-    }
-}
-
-#[derive(Clone)]
-struct ChatTemplate {
-    template: Template<'static, 'static>,
-    bos_token: Option<String>,
-    eos_token: Option<String>,
-    use_default_tool_template: bool,
-}
-
-impl ChatTemplate {
-    fn new(template: String, bos_token: Option<String>, eos_token: Option<String>) -> Self {
-        let mut env = Box::new(Environment::new());
-        let template_str = template.into_boxed_str();
-        env.add_function("raise_exception", raise_exception);
-
-        // check if contains the tools variable within the template
-        let use_default_tool_template =
-            !template_str.as_ref().replace(' ', "").contains("{{tools}}");
-        // leaking env and template_str as read-only, static resources for performance.
-        let template = Box::leak(env)
-            .template_from_str(Box::leak(template_str))
-            .unwrap();
-
-        Self {
-            template,
-            bos_token,
-            eos_token,
-            use_default_tool_template,
-        }
-    }
-
-    fn apply(
-        &self,
-        mut messages: Vec<Message>,
-        grammar_with_prompt: Option<(GrammarType, String)>,
-    ) -> Result<String, InferError> {
-        if self.use_default_tool_template {
-            if let Some(last_message) = messages.last_mut() {
-                if let Some((GrammarType::Json(tools), tool_prompt)) = grammar_with_prompt {
-                    last_message.content.push(MessageChunk::Text(Text {
-                        text: format!("\n---\n{}\n{}", tool_prompt, tools),
-                    }));
-                }
-            }
-        }
-
-        let messages: Vec<TextMessage> = messages.into_iter().map(|c| c.into()).collect();
-
-        self.template
-            .render(ChatTemplateInputs {
-                messages,
-                bos_token: self.bos_token.as_deref(),
-                eos_token: self.eos_token.as_deref(),
-                add_generation_prompt: true,
-                tools: None,
-                tools_prompt: None,
-            })
-            .map_err(InferError::TemplateError)
-    }
-}
-
-pub struct ToolGrammar {}
-
-impl ToolGrammar {
-    pub fn apply(
-        tools: Option<Vec<Tool>>,
-        tool_choice: Option<ToolType>,
-    ) -> Result<Option<Tools>, InferError> {
-        if let Some((req_tools, tool_choice)) = tools.zip(tool_choice) {
-            // let tool_prompt = tool_prompt.unwrap_or_default();
-            let tools_to_use = match tool_choice {
-                ToolType::FunctionName(name) => {
-                    vec![req_tools
-                        .iter()
-                        .find(|tool| tool.function.name == *name)
-                        .unwrap_or_else(|| panic!("Tool with name {} not found", name))
-                        .clone()]
-                }
-                ToolType::OneOf => req_tools.to_owned(),
-            };
-
-            // adds the error notification function for LLM feedback if required
-            let mut text_response_properties = Map::new();
-            text_response_properties.insert(
-                "error".to_string(),
-                serde_json::json!({
-                    "type": "string",
-                    "description": "The error or issue to notify"
-                }),
-            );
-            text_response_properties.insert(
-                "_name".to_string(),
-                serde_json::json!({
-                    "type": "string",
-                    "const": "notify_error"
-                }),
-            );
-
-            let functions: HashMap<String, serde_json::Value> = tools_to_use
-                .iter()
-                .map(|tool| {
-                    let func = tool.function.clone();
-
-                    // Clone the existing parameters, which are expected to be a JSON object
-                    let mut params = if let Value::Object(params) = &func.arguments {
-                        params.clone()
-                    } else {
-                        Map::new()
-                    };
-
-                    // Insert the function's description at the top level, outside of properties
-                    params.insert(
-                        "description".to_string(),
-                        Value::String(func.description.clone().unwrap_or_default()),
-                    );
-
-                    // Ensure 'properties' exists and is an object
-                    let properties = params
-                        .entry("properties".to_string())
-                        .or_insert_with(|| json!({}))
-                        .as_object_mut()
-                        .unwrap();
-
-                    // Insert the constant for the function name inside 'properties'
-                    properties.insert(
-                        "_name".to_string(),
-                        json!({
-                            "type": "string",
-                            "const": func.name.clone(),
-                            // "description": "The name of the function"
-                        }),
-                    );
-
-                    // Check if 'required' exists, and it is an array. If not, create an empty array.
-                    let required = params
-                        .entry("required".to_string())
-                        .or_insert_with(|| json!([]))
-                        .as_array_mut()
-                        .unwrap();
-
-                    // Add 'name' to the 'required' array if it is not already present
-                    if !required.iter().any(|r| r == "_name") {
-                        required.push(json!("_name"));
-                    }
-
-                    (func.name, Value::Object(params))
-                })
-                .chain([(
-                    "notify_error".to_string(),
-                    serde_json::json!({
-                        "properties": text_response_properties,
-                        "required": ["error", "_name"],
-                        "type": "object"
-                    }),
-                )])
-                .collect();
-
-            let tools = Tools {
-                functions_map: FunctionsMap { functions },
-                properties: Properties {
-                    function: tools_to_use
-                        .iter()
-                        .map(|tool| FunctionRef {
-                            ref_path: format!("#/$functions/{}", tool.function.name.clone()),
-                        })
-                        .chain(std::iter::once(FunctionRef {
-                            ref_path: "#/$functions/notify_error".to_string(),
-                        }))
-                        .collect(),
-                },
-            };
-
-            return Ok(Some(tools));
-        }
-        // Err(InferError::ToolError("No tools provided".to_string()))
-        Ok(None)
-    }
 }

 /// Batching logic
@ -512,7 +101,7 @@ impl ToolGrammar {
 ///
 /// Batches requests and sends them to the inference server
 #[allow(clippy::too_many_arguments)]
-async fn batching_task(
+pub(crate) async fn batching_task(
    mut client: ShardedClient,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
@ -520,13 +109,13 @@ async fn batching_task(
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
    queue: Queue,
-    shared: Arc<Shared>,
+    notifier: Arc<Notify>,
    generation_health: Arc<AtomicBool>,
 ) {
    // Infinite loop
    loop {
        // Wait for a notification from the Infer struct
-        shared.batching_task.notified().await;
+        notifier.notified().await;

        // Get the next batch from the queue
        // This batch might be smaller than the maximum batch size if there are not enough requests
@ -792,6 +381,16 @@ fn send_responses(
    let mut stopped = false;

    if let Some(prefill_tokens) = generation.prefill_tokens {
+        // Create Token objects
+        // We do that here instead of in the Python code as Rust for loops are faster
+        let prefill_tokens = prefill_tokens
+            .ids
+            .into_iter()
+            .zip(prefill_tokens.logprobs)
+            .zip(prefill_tokens.texts)
+            .map(|((id, logprob), text)| PrefillToken { id, text, logprob })
+            .collect();
+
        // Send message
        entry
            .response_tx
@ -842,7 +441,7 @@ fn send_responses(
                entry.response_tx.send(Ok(InferStreamResponse::End {
                    token,
                    top_tokens,
-                    generated_text: generated_text.clone(),
+                    generated_text: GeneratedText::from(generated_text.clone()),
                    queued: entry.queue_time,
                    start: entry.batch_time.unwrap(),
                }))?;
@ -877,64 +476,21 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
    });
 }

-#[derive(Debug)]
-pub(crate) enum InferStreamResponse {
-    // Optional first message
-    Prefill(Tokens),
-    // Intermediate messages
-    Intermediate {
-        token: Token,
-        top_tokens: Vec<Token>,
-    },
-    // Last message
-    End {
-        token: Token,
-        top_tokens: Vec<Token>,
-        generated_text: GeneratedText,
-        start: Instant,
-        queued: Instant,
-    },
-}
+impl From<text_generation_client::v2::GeneratedText> for GeneratedText {
+    fn from(value: text_generation_client::v2::GeneratedText) -> Self {
+        let v2_finish_reason =
+            text_generation_client::v2::FinishReason::try_from(value.finish_reason).unwrap();
+        let finish_reason = match v2_finish_reason {
+            text_generation_client::v2::FinishReason::Length => FinishReason::Length,
+            text_generation_client::v2::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
+            text_generation_client::v2::FinishReason::StopSequence => FinishReason::StopSequence,
+        };

-#[derive(Debug)]
-pub(crate) struct InferResponse {
-    /// input_length is the input as perceived by the rust tokenizer in the
-    /// validation pathway. It is redundant with prefill.len() but prefill
-    /// has data only if the user asked for it. This will always be filled.
-    pub(crate) _input_length: u32,
-    pub(crate) prefill: Vec<PrefillToken>,
-    pub(crate) tokens: Vec<Token>,
-    pub(crate) generated_text: GeneratedText,
-    pub(crate) queued: Instant,
-    pub(crate) start: Instant,
-    pub(crate) top_tokens: Vec<Vec<Token>>,
-}
-
-#[derive(Debug, Error)]
-pub enum InferError {
-    #[error("Request failed during generation: {0}")]
-    GenerationError(String),
-    #[error("Model is overloaded")]
-    Overloaded(#[from] TryAcquireError),
-    #[error("Input validation error: {0}")]
-    ValidationError(#[from] ValidationError),
-    #[error("Incomplete generation")]
-    IncompleteGeneration,
-    #[error("Template error: {0}")]
-    TemplateError(#[from] minijinja::Error),
-    #[error("Tool error: {0}")]
-    ToolError(String),
-}
-
-impl InferError {
-    pub(crate) fn error_type(&self) -> &str {
-        match self {
-            InferError::GenerationError(_) => "generation",
-            InferError::Overloaded(_) => "overloaded",
-            InferError::ValidationError(_) => "validation",
-            InferError::IncompleteGeneration => "incomplete_generation",
-            InferError::TemplateError(_) => "template_error",
-            InferError::ToolError(_) => "tool_error",
+        Self {
+            text: value.text,
+            generated_tokens: value.generated_tokens,
+            finish_reason,
+            seed: value.seed,
        }
    }
 }
--- a/router/src/infer/v3/mod.rs
+++ b/router/src/infer/v3/mod.rs
@ -0,0 +1,4 @@
+mod queue;
+mod scheduler;
+
+pub(crate) use scheduler::SchedulerV3;
--- a/router/src/infer/v3/queue.rs
+++ b/router/src/infer/v3/queue.rs
@ -1,12 +1,14 @@
-use crate::infer::InferError;
-use crate::infer::InferStreamResponse;
-use crate::validation::ValidGenerateRequest;
+use crate::infer::{InferError, InferStreamResponse};
+use crate::validation::{
+    ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
+};
 use nohash_hasher::{BuildNoHashHasher, IntMap};
 use std::cmp::min;
 use std::collections::VecDeque;
-use text_generation_client::ChunksToString;
-use text_generation_client::Input;
-use text_generation_client::{Batch, Request};
+use text_generation_client::v3::{
+    Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
+};
+use text_generation_client::{ChunksToString, Input};
 use tokio::sync::{mpsc, oneshot};
 use tokio::time::Instant;
 use tracing::{info_span, instrument, Span};
@ -57,7 +59,6 @@ impl Queue {
        Self { queue_sender }
    }

-    /// Append an entry to the queue
    #[instrument(skip_all)]
    pub(crate) fn append(&self, entry: Entry) {
        // Send append command to the background task managing the state
@ -280,13 +281,17 @@ impl State {
            batch_requests.push(Request {
                id,
                prefill_logprobs: entry.request.decoder_input_details,
+                inputs: entry.request.inputs.chunks_to_string(),
                input_chunks: Some(Input {
                    chunks: entry.request.inputs.clone(),
                }),
-                inputs: entry.request.inputs.chunks_to_string(),
                truncate: entry.request.truncate,
-                parameters: Some(entry.request.parameters.clone()),
-                stopping_parameters: Some(entry.request.stopping_parameters.clone()),
+                parameters: Some(NextTokenChooserParameters::from(
+                    entry.request.parameters.clone(),
+                )),
+                stopping_parameters: Some(StoppingCriteriaParameters::from(
+                    entry.request.stopping_parameters.clone(),
+                )),
                top_n_tokens: entry.request.top_n_tokens,
            });
            // Set batch_time
@ -355,12 +360,46 @@ enum QueueCommand {
    },
 }

+impl From<ValidParameters> for NextTokenChooserParameters {
+    fn from(value: ValidParameters) -> Self {
+        let (grammar, grammar_type) = match value.grammar {
+            None => (String::new(), GrammarType::None),
+
+            Some(grammar) => match grammar {
+                ValidGrammar::Json(grammar_string) => (grammar_string, GrammarType::Json),
+                ValidGrammar::Regex(grammar_string) => (grammar_string, GrammarType::Regex),
+            },
+        };
+
+        Self {
+            temperature: value.temperature,
+            top_k: value.top_k,
+            top_p: value.top_p,
+            typical_p: value.typical_p,
+            do_sample: value.do_sample,
+            seed: value.seed,
+            repetition_penalty: value.repetition_penalty,
+            frequency_penalty: value.frequency_penalty,
+            watermark: value.watermark,
+            grammar,
+            grammar_type: grammar_type.into(),
+        }
+    }
+}
+
+impl From<ValidStoppingParameters> for StoppingCriteriaParameters {
+    fn from(value: ValidStoppingParameters) -> Self {
+        Self {
+            max_new_tokens: value.max_new_tokens,
+            stop_sequences: value.stop_sequences,
+            ignore_eos_token: value.ignore_eos_token,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
-    use text_generation_client::{
-        GrammarType as ProtoGrammarType, NextTokenChooserParameters, StoppingCriteriaParameters,
-    };
    use tracing::info_span;

    fn default_entry() -> (
@ -375,7 +414,7 @@ mod tests {
                input_length: 0,
                truncate: 0,
                decoder_input_details: false,
-                parameters: NextTokenChooserParameters {
+                parameters: ValidParameters {
                    temperature: 0.0,
                    top_k: 0,
                    top_p: 0.0,
@ -385,10 +424,9 @@ mod tests {
                    repetition_penalty: 0.0,
                    frequency_penalty: 0.0,
                    watermark: false,
-                    grammar: String::new(),
-                    grammar_type: ProtoGrammarType::None as i32,
+                    grammar: None,
                },
-                stopping_parameters: StoppingCriteriaParameters {
+                stopping_parameters: ValidStoppingParameters {
                    ignore_eos_token: false,
                    max_new_tokens: 1,
                    stop_sequences: vec![],
--- a/router/src/infer/v3/scheduler.rs
+++ b/router/src/infer/v3/scheduler.rs
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -1,27 +1,14 @@
-pub mod config;
-mod health;
 /// Text Generation Inference Webserver
+pub mod config;
 mod infer;
-mod queue;
 pub mod server;
 mod validation;

-use infer::{Infer, InferError, InferStreamResponse};
-use queue::{Entry, Queue};
 use serde::{Deserialize, Serialize};
-use tokio::sync::OwnedSemaphorePermit;
-use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::warn;
 use utoipa::ToSchema;
 use validation::Validation;

-/// Type alias for generation responses
-pub(crate) type GenerateStreamResponse = (
-    OwnedSemaphorePermit,
-    u32, // input_length
-    UnboundedReceiverStream<Result<InferStreamResponse, InferError>>,
-);
-
 #[derive(Clone, Deserialize, ToSchema)]
 pub(crate) struct VertexInstance {
    #[schema(example = "What is Deep Learning?")]
@ -158,7 +145,7 @@ pub struct Info {
    #[schema(example = "4")]
    pub max_stop_sequences: usize,
    #[schema(example = "1024")]
-    pub max_input_length: usize,
+    pub max_input_tokens: usize,
    #[schema(example = "2048")]
    pub max_total_tokens: usize,
    #[schema(example = "1.2")]
@ -1087,7 +1074,7 @@ pub struct SimpleToken {
    stop: usize,
 }

-#[derive(Serialize, ToSchema)]
+#[derive(Debug, Serialize, ToSchema)]
 #[serde(rename_all(serialize = "snake_case"))]
 #[schema(example = "Length")]
 pub(crate) enum FinishReason {
--- a/router/src/main.rs
+++ b/router/src/main.rs
@ -12,7 +12,6 @@ use std::fs::File;
 use std::io::BufReader;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr};
 use std::path::{Path, PathBuf};
-use text_generation_client::{ClientError, ShardedClient};
 use text_generation_router::config::Config;
 use text_generation_router::{server, HubModelInfo, HubProcessorConfig, HubTokenizerConfig};
 use thiserror::Error;
@ -315,59 +314,6 @@ async fn main() -> Result<(), RouterError> {
        Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation",
    };

-    // Instantiate sharded client from the master unix socket
-    let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
-        .await
-        .map_err(RouterError::Connection)?;
-    // Clear the cache; useful if the webserver rebooted
-    sharded_client
-        .clear_cache(None)
-        .await
-        .map_err(RouterError::Cache)?;
-    // Get info from the shard
-    let shard_info = sharded_client.info().await.map_err(RouterError::Info)?;
-
-    // Warmup model
-    tracing::info!("Warming up model");
-    let max_supported_batch_total_tokens = match sharded_client
-        .warmup(
-            max_input_tokens as u32,
-            max_batch_prefill_tokens,
-            max_total_tokens as u32,
-            max_batch_size,
-        )
-        .await
-        .map_err(RouterError::Warmup)?
-    {
-        // Older models do not support automatic max-batch-total-tokens
-        None => {
-            let max_batch_total_tokens = max_batch_total_tokens
-                .unwrap_or(16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)));
-            tracing::warn!("Model does not support automatic max batch total tokens");
-            max_batch_total_tokens
-        }
-        // Flash attention models return their max supported total tokens
-        Some(max_supported_batch_total_tokens) => {
-            // Warn if user added his own max-batch-total-tokens as we will ignore it
-            if max_batch_total_tokens.is_some() {
-                tracing::warn!(
-                    "`--max-batch-total-tokens` is deprecated for Flash \
-                        Attention models."
-                );
-                tracing::warn!(
-                    "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
-                );
-            }
-            if max_total_tokens as u32 > max_supported_batch_total_tokens {
-                return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_supported_batch_total_tokens}")));
-            }
-
-            max_supported_batch_total_tokens
-        }
-    };
-    tracing::info!("Setting max batch total tokens to {max_supported_batch_total_tokens}");
-    tracing::info!("Connected");
-
    // Determine the server port based on the feature and environment variable.
    let port = if cfg!(feature = "google") {
        std::env::var("AIP_HTTP_PORT")
@ -387,8 +333,8 @@ async fn main() -> Result<(), RouterError> {

    // Run server
    server::run(
+        master_shard_uds_path,
        model_info,
-        shard_info,
        compat_return_full_text,
        max_concurrent_requests,
        max_best_of,
@ -398,10 +344,9 @@ async fn main() -> Result<(), RouterError> {
        max_total_tokens,
        waiting_served_ratio,
        max_batch_prefill_tokens,
-        max_supported_batch_total_tokens,
+        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
-        sharded_client,
        tokenizer,
        config,
        validation_workers,
@ -557,16 +502,8 @@ pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConf
 enum RouterError {
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
-    #[error("Unable to connect to the Python model shards: {0}")]
-    Connection(ClientError),
-    #[error("Unable to clear the Python model shards cache: {0}")]
-    Cache(ClientError),
-    #[error("Unable to get the Python model shards info: {0}")]
-    Info(ClientError),
-    #[error("Unable to warmup the Python model shards: {0}")]
-    Warmup(ClientError),
+    #[error("WebServer error: {0}")]
+    WebServer(#[from] server::WebServerError),
    #[error("Tokio runtime failed to start: {0}")]
    Tokio(#[from] std::io::Error),
-    #[error("Axum webserver failed: {0}")]
-    Axum(#[from] axum::BoxError),
 }
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -1,13 +1,15 @@
-use crate::config::Config;
 /// HTTP Server logic
-use crate::health::Health;
-use crate::infer::{InferError, InferResponse, InferStreamResponse, ToolGrammar};
+use crate::config::Config;
+use crate::infer::v2::SchedulerV2;
+use crate::infer::v3::SchedulerV3;
+use crate::infer::{HealthCheck, Scheduler};
+use crate::infer::{Infer, InferError, InferResponse, InferStreamResponse, ToolGrammar};
 use crate::validation::ValidationError;
 use crate::{
    BestOfSequence, Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest,
-    GenerateResponse, GrammarType, HubModelInfo, HubProcessorConfig, HubTokenizerConfig, Infer,
-    Info, Message, PrefillToken, SimpleToken, StreamDetails, StreamResponse, Token,
-    TokenizeResponse, Usage, Validation,
+    GenerateResponse, GrammarType, HubModelInfo, HubProcessorConfig, HubTokenizerConfig, Info,
+    Message, PrefillToken, SimpleToken, StreamDetails, StreamResponse, Token, TokenizeResponse,
+    Usage, Validation,
 };
 use crate::{
    ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
@ -34,7 +36,8 @@ use std::convert::Infallible;
 use std::net::SocketAddr;
 use std::sync::atomic::AtomicBool;
 use std::sync::Arc;
-use text_generation_client::{ShardInfo, ShardedClient};
+use text_generation_client::{v2, v3, ClientError, ShardInfo};
+use thiserror::Error;
 use tokenizers::Tokenizer;
 use tokio::select;
 use tokio::signal;
@ -115,7 +118,9 @@ example = json ! ({"error": "unhealthy", "error_type": "healthcheck"})),
 )]
 #[instrument(skip(health))]
 /// Health check method
-async fn health(mut health: Extension<Health>) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
+async fn health(
+    mut health: Extension<HealthCheck>,
+) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
    match health.check().await {
        true => Ok(()),
        false => Err((
@ -213,9 +218,7 @@ async fn generate_internal(

                        BestOfSequence {
                            generated_text: output_text,
-                            finish_reason: FinishReason::from(
-                                response.generated_text.finish_reason,
-                            ),
+                            finish_reason: response.generated_text.finish_reason,
                            generated_tokens: response.generated_text.generated_tokens,
                            prefill: response.prefill,
                            tokens: response.tokens,
@ -227,7 +230,7 @@ async fn generate_internal(
            });

            Some(Details {
-                finish_reason: FinishReason::from(response.generated_text.finish_reason),
+                finish_reason: response.generated_text.finish_reason,
                generated_tokens: response.generated_text.generated_tokens,
                prefill: response.prefill,
                tokens: response.tokens,
@ -468,7 +471,7 @@ async fn generate_stream_internal(
                                        // Token details
                                        let details = match details {
                                            true => Some(StreamDetails {
-                                                finish_reason: FinishReason::from(generated_text.finish_reason),
+                                                finish_reason: generated_text.finish_reason,
                                                generated_tokens: generated_text.generated_tokens,
                                                seed: generated_text.seed,
                                            }),
@ -1372,21 +1375,20 @@ pub(crate) struct ComputeType(String);
 /// Serving method
 #[allow(clippy::too_many_arguments)]
 pub async fn run(
+    master_shard_uds_path: String,
    model_info: HubModelInfo,
-    shard_info: ShardInfo,
    compat_return_full_text: bool,
    max_concurrent_requests: usize,
    max_best_of: usize,
    max_stop_sequences: usize,
    max_top_n_tokens: u32,
-    max_input_length: usize,
+    max_input_tokens: usize,
    max_total_tokens: usize,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
-    max_batch_total_tokens: u32,
+    max_batch_total_tokens: Option<u32>,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
-    client: ShardedClient,
    tokenizer: Option<Tokenizer>,
    config: Option<Config>,
    validation_workers: usize,
@ -1400,7 +1402,7 @@ pub async fn run(
    messages_api_enabled: bool,
    grammar_support: bool,
    max_client_batch_size: usize,
-) -> Result<(), axum::BoxError> {
+) -> Result<(), WebServerError> {
    // OpenAPI documentation
    #[derive(OpenApi)]
    #[openapi(
@ -1470,6 +1472,141 @@ pub async fn run(
    struct ApiDoc;

    // Create state
+
+    // Open connection, get model info and warmup
+    let (scheduler, health_ext, shard_info, max_batch_total_tokens): (
+        Arc<dyn Scheduler + Send + Sync>,
+        HealthCheck,
+        ShardInfo,
+        u32,
+    ) = {
+        // Helper function to check both v2 and v3
+        let check_max_batch_total_tokens = |max_supported_batch_total_tokens: Option<u32>| {
+            match max_supported_batch_total_tokens {
+                // Older models do not support automatic max-batch-total-tokens
+                None => {
+                    let max_batch_total_tokens = max_batch_total_tokens.unwrap_or(
+                        16000.max((max_total_tokens as u32).max(max_batch_prefill_tokens)),
+                    );
+                    tracing::warn!("Model does not support automatic max batch total tokens");
+                    Ok(max_batch_total_tokens)
+                }
+                // Flash attention models return their max supported total tokens
+                Some(max_supported_batch_total_tokens) => {
+                    // Warn if user added his own max-batch-total-tokens as we will ignore it
+                    if max_batch_total_tokens.is_some() {
+                        tracing::warn!(
+                            "`--max-batch-total-tokens` is deprecated for Flash \
+                        Attention models."
+                        );
+                        tracing::warn!(
+                            "Inferred max batch total tokens: {max_supported_batch_total_tokens}"
+                        );
+                    }
+                    if max_total_tokens as u32 > max_supported_batch_total_tokens {
+                        return Err(WebServerError::NotEnoughMemory(max_total_tokens));
+                    }
+
+                    Ok(max_supported_batch_total_tokens)
+                }
+            }
+        };
+
+        let generation_health = Arc::new(AtomicBool::new(false));
+
+        match v3::ShardedClient::connect_uds(master_shard_uds_path.clone()).await {
+            Ok(mut sharded_client) => {
+                // server is running on v3
+                // Clear the cache; useful if the webserver rebooted
+                sharded_client
+                    .clear_cache(None)
+                    .await
+                    .map_err(WebServerError::Cache)?;
+                // Get info from the shard
+                let shard_info = sharded_client.info().await.map_err(WebServerError::Info)?;
+
+                // Warmup model
+                tracing::info!("Warming up model");
+                let max_batch_total_tokens = check_max_batch_total_tokens(
+                    sharded_client
+                        .warmup(
+                            max_input_tokens as u32,
+                            max_batch_prefill_tokens,
+                            max_total_tokens as u32,
+                            max_batch_size,
+                        )
+                        .await
+                        .map_err(WebServerError::Warmup)?,
+                )?;
+
+                let health_ext =
+                    HealthCheck::new(Arc::new(sharded_client.clone()), generation_health.clone());
+                let scheduler = Arc::new(SchedulerV3::new(
+                    sharded_client,
+                    waiting_served_ratio,
+                    max_batch_prefill_tokens,
+                    max_batch_total_tokens,
+                    max_waiting_tokens,
+                    max_batch_size,
+                    shard_info.requires_padding,
+                    shard_info.window_size,
+                    shard_info.speculate,
+                    generation_health,
+                ));
+                tracing::info!("Using scheduler V3");
+
+                (scheduler, health_ext, shard_info, max_batch_total_tokens)
+            }
+            Err(_) => {
+                let mut sharded_client = v2::ShardedClient::connect_uds(master_shard_uds_path)
+                    .await
+                    .map_err(WebServerError::Connection)?;
+
+                // server is running on v2
+                // Clear the cache; useful if the webserver rebooted
+                sharded_client
+                    .clear_cache(None)
+                    .await
+                    .map_err(WebServerError::Cache)?;
+                // Get info from the shard
+                let shard_info = sharded_client.info().await.map_err(WebServerError::Info)?;
+
+                // Warmup model
+                tracing::info!("Warming up model");
+                let max_batch_total_tokens = check_max_batch_total_tokens(
+                    sharded_client
+                        .warmup(
+                            max_input_tokens as u32,
+                            max_batch_prefill_tokens,
+                            max_total_tokens as u32,
+                            max_batch_size,
+                        )
+                        .await
+                        .map_err(WebServerError::Warmup)?,
+                )?;
+
+                let health_ext =
+                    HealthCheck::new(Arc::new(sharded_client.clone()), generation_health.clone());
+                let scheduler = Arc::new(SchedulerV2::new(
+                    sharded_client,
+                    waiting_served_ratio,
+                    max_batch_prefill_tokens,
+                    max_batch_total_tokens,
+                    max_waiting_tokens,
+                    max_batch_size,
+                    shard_info.requires_padding,
+                    shard_info.window_size,
+                    shard_info.speculate,
+                    generation_health,
+                ));
+                tracing::info!("Using scheduler V2");
+
+                (scheduler, health_ext, shard_info, max_batch_total_tokens)
+            }
+        }
+    };
+    tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
+
    let validation = Validation::new(
        validation_workers,
        tokenizer,
@ -1477,25 +1614,15 @@ pub async fn run(
        max_best_of,
        max_stop_sequences,
        max_top_n_tokens,
-        max_input_length,
+        max_input_tokens,
        max_total_tokens,
        grammar_support,
    );
-    let generation_health = Arc::new(AtomicBool::new(false));
-    let health_ext = Health::new(client.clone(), generation_health.clone());
+
    let infer = Infer::new(
-        client,
+        scheduler,
        validation,
-        waiting_served_ratio,
-        max_batch_prefill_tokens,
-        max_batch_total_tokens,
-        max_waiting_tokens,
-        max_batch_size,
        max_concurrent_requests,
-        shard_info.requires_padding,
-        shard_info.window_size,
-        shard_info.speculate,
-        generation_health,
        tokenizer_config,
        processor_config,
    );
@ -1514,7 +1641,7 @@ pub async fn run(
    // Input Length buckets
    let input_length_matcher = Matcher::Full(String::from("tgi_request_input_length"));
    let input_length_buckets: Vec<f64> = (0..100)
-        .map(|x| (max_input_length as f64 / 100.0) * (x + 1) as f64)
+        .map(|x| (max_input_tokens as f64 / 100.0) * (x + 1) as f64)
        .collect();
    // Generated tokens buckets
    let generated_tokens_matcher = Matcher::Full(String::from("tgi_request_generated_tokens"));
@ -1568,7 +1695,7 @@ pub async fn run(
        max_concurrent_requests,
        max_best_of,
        max_stop_sequences,
-        max_input_length,
+        max_input_tokens,
        max_total_tokens,
        waiting_served_ratio,
        max_batch_total_tokens,
@ -1664,6 +1791,8 @@ pub async fn run(
        .layer(OtelAxumLayer::default())
        .layer(cors_layer);

+    tracing::info!("Connected");
+
    if ngrok {
        #[cfg(feature = "ngrok")]
        {
@ -1686,7 +1815,8 @@ pub async fn run(
        let listener = tokio::net::TcpListener::bind(&addr).await.unwrap();
        axum::serve(listener, app)
            .with_graceful_shutdown(shutdown_signal())
-            .await?;
+            .await
+            .map_err(|err| WebServerError::Axum(Box::new(err)))?;
    }
    Ok(())
 }
@ -1719,17 +1849,6 @@ async fn shutdown_signal() {
    opentelemetry::global::shutdown_tracer_provider();
 }

-impl From<i32> for FinishReason {
-    fn from(finish_reason: i32) -> Self {
-        let finish_reason = text_generation_client::FinishReason::try_from(finish_reason).unwrap();
-        match finish_reason {
-            text_generation_client::FinishReason::Length => FinishReason::Length,
-            text_generation_client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
-            text_generation_client::FinishReason::StopSequence => FinishReason::StopSequence,
-        }
-    }
-}
-
 /// Convert to Axum supported formats
 impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
    fn from(err: InferError) -> Self {
@ -1762,3 +1881,19 @@ impl From<InferError> for Event {
            .unwrap()
    }
 }
+
+#[derive(Debug, Error)]
+pub enum WebServerError {
+    #[error("Unable to connect to the Python model shards: {0}")]
+    Connection(ClientError),
+    #[error("Unable to clear the Python model shards cache: {0}")]
+    Cache(ClientError),
+    #[error("Unable to get the Python model shards info: {0}")]
+    Info(ClientError),
+    #[error("Unable to warmup the Python model shards: {0}")]
+    Warmup(ClientError),
+    #[error("Not enough memory to handle `max_total_tokens={0}`")]
+    NotEnoughMemory(usize),
+    #[error("Axum error: {0}")]
+    Axum(#[from] axum::BoxError),
+}
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -1,20 +1,16 @@
-use crate::config::Config;
 /// Payload validation logic
+use crate::config::Config;
 use crate::validation::ValidationError::{BestOfSampling, BestOfSeed, EmptyInput};
 use crate::{GenerateParameters, GenerateRequest, GrammarType};
+use base64::{engine::general_purpose::STANDARD, Engine};
+use image::{io::Reader as ImageReader, ImageFormat};
 use jsonschema::{Draft, JSONSchema};
 use rand::{thread_rng, Rng};
 use serde_json::Value;
 use std::io::Cursor;
-use text_generation_client::{
-    Chunk, GrammarType as ProtoGrammarType, Image, InputChunk, NextTokenChooserParameters,
-    StoppingCriteriaParameters,
-};
+use text_generation_client::{Chunk, Image, InputChunk};
 use thiserror::Error;
 use tokenizers::tokenizer::Tokenizer;
-// use tokenizers::TruncationDirection;
-use base64::{engine::general_purpose::STANDARD, Engine};
-use image::{io::Reader as ImageReader, ImageFormat};
 use tokio::sync::mpsc;
 use tokio::sync::oneshot;
 use tracing::{instrument, Span};
@ -173,10 +169,6 @@ impl Validation {
            // Validate MaxNewTokens
            if (input_length as u32 + max_new_tokens) > self.max_total_tokens as u32 {
                input_length = input_length.saturating_sub(max_new_tokens as usize);
-                // return Err(ValidationError::MaxNewTokens(
-                //     self.max_total_tokens - self.max_input_length,
-                //     max_new_tokens,
-                // ));
            }

            Ok((
@ -327,13 +319,13 @@ impl Validation {
        // compiler and use that to build the FSM here.

        // Validate grammar and unpack the grammar and type for the proto message
-        let (grammar, grammar_type) = match grammar {
+        let grammar = match grammar {
            Some(grammar) => {
                // Ensure that grammar is not set if it's not supported
                if self.disable_grammar_support {
                    return Err(ValidationError::Grammar);
                }
-                match grammar {
+                let valid_grammar = match grammar {
                    GrammarType::Json(json) => {
                        let json = match json {
                            // if value is a string, we need to parse it again to make sure its
@ -350,20 +342,20 @@ impl Validation {
                            .compile(&json)
                            .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?;

-                        (
                        // Serialize json to string
+                        ValidGrammar::Json(
                            serde_json::to_string(&json)
                                .map_err(|e| ValidationError::InvalidGrammar(e.to_string()))?,
-                            ProtoGrammarType::Json.into(),
                        )
                    }
-                    GrammarType::Regex(regex) => (regex, ProtoGrammarType::Regex.into()),
+                    GrammarType::Regex(regex) => ValidGrammar::Regex(regex),
+                };
+                Some(valid_grammar)
            }
-            }
-            None => (String::new(), ProtoGrammarType::None.into()),
+            None => None,
        };

-        let parameters = NextTokenChooserParameters {
+        let parameters = ValidParameters {
            temperature,
            repetition_penalty,
            frequency_penalty,
@ -374,9 +366,8 @@ impl Validation {
            seed,
            watermark,
            grammar,
-            grammar_type,
        };
-        let stopping_parameters = StoppingCriteriaParameters {
+        let stopping_parameters = ValidStoppingParameters {
            max_new_tokens,
            stop_sequences,
            ignore_eos_token: false,
@ -458,6 +449,7 @@ fn format_from_mimetype(mimetype: &str) -> Option<ImageFormat> {
        _ => None,
    }
 }
+
 fn format_to_mimetype(format: ImageFormat) -> String {
    match format {
        ImageFormat::Png => "image/png",
@ -636,14 +628,55 @@ type TokenizerRequest = (
    Span,
 );

+#[derive(Debug, Clone)]
+pub(crate) enum ValidGrammar {
+    Json(String),
+    Regex(String),
+}
+
+#[derive(Debug, Clone)]
+pub(crate) struct ValidParameters {
+    /// / exponential scaling output probability distribution
+    pub temperature: f32,
+    /// / restricting to the k highest probability elements
+    pub top_k: u32,
+    /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
+    pub top_p: f32,
+    /// / restricting to top tokens summing to prob_cut_off <= prob_cut_off
+    pub typical_p: f32,
+    /// / apply sampling on the logits
+    pub do_sample: bool,
+    /// / random seed for sampling
+    pub seed: u64,
+    /// / repetition penalty
+    pub repetition_penalty: f32,
+    /// / frequency penalty
+    pub frequency_penalty: f32,
+    /// / token watermarking using "A Watermark for Large Language Models"
+    pub watermark: bool,
+    /// / grammar (applied if not empty)
+    pub grammar: Option<ValidGrammar>,
+}
+
+#[derive(Debug, Clone)]
+pub(crate) struct ValidStoppingParameters {
+    /// / Maximum number of generated tokens
+    pub max_new_tokens: u32,
+    /// / Optional stopping sequences
+    pub stop_sequences: Vec<String>,
+    /// / Ignore end of sequence token
+    /// / used for benchmarking
+    pub ignore_eos_token: bool,
+}
+
 #[derive(Debug, Clone)]
 pub(crate) struct ValidGenerateRequest {
    pub inputs: Vec<InputChunk>,
    pub input_length: u32,
    pub truncate: u32,
    pub decoder_input_details: bool,
-    pub parameters: NextTokenChooserParameters,
-    pub stopping_parameters: StoppingCriteriaParameters,
+    pub parameters: ValidParameters,
+    pub stopping_parameters: ValidStoppingParameters,
    pub top_n_tokens: u32,
 }

--- a/server/Makefile
+++ b/server/Makefile
@ -12,8 +12,8 @@ gen-server:
 	# Compile protos
 	pip install grpcio-tools==1.51.1 mypy-protobuf==3.4.0 'types-protobuf>=3.20.4' --no-cache-dir
 	mkdir text_generation_server/pb || true
-	python -m grpc_tools.protoc -I../proto --python_out=text_generation_server/pb \
-		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/generate.proto
+	python -m grpc_tools.protoc -I../proto/v3 --python_out=text_generation_server/pb \
+		--grpc_python_out=text_generation_server/pb --mypy_out=text_generation_server/pb ../proto/v3/generate.proto
 	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
 	touch text_generation_server/pb/__init__.py