refacto

2024-06-26 14:00:03 +02:00 · 2024-06-26 14:00:03 +02:00 · 93e0a7de8b
parent b562680be4
commit 93e0a7de8b
28 changed files with 254 additions and 392 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -194,17 +194,6 @@ dependencies = [
 "v_frame",
 ]
 [[package]]
 name = "average"
 version = "0.14.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c309b1c7fca12ebeec3ecba29ea917b3a4cb458ccf504df68bb4d8a0ca565a00"
 dependencies = [
 "easy-cast",
 "float-ord",
 "num-traits",
 ]
 [[package]]
 name = "avif-serialize"
 version = "0.8.1"
@ -503,12 +492,6 @@ dependencies = [
 "thiserror",
 ]
 [[package]]
 name = "cassowary"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
 [[package]]
 name = "cc"
 version = "1.0.98"
@ -570,7 +553,7 @@ version = "4.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64"
 dependencies = [
- "heck 0.5.0",
+ "heck",
 "proc-macro2",
 "quote",
 "syn 2.0.66",
@ -675,31 +658,6 @@ version = "0.8.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
 [[package]]
 name = "crossterm"
 version = "0.27.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df"
 dependencies = [
 "bitflags 2.5.0",
 "crossterm_winapi",
 "libc",
 "mio",
 "parking_lot",
 "signal-hook",
 "signal-hook-mio",
 "winapi",
 ]
 [[package]]
 name = "crossterm_winapi"
 version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
 dependencies = [
 "winapi",
 ]
 [[package]]
 name = "crunchy"
 version = "0.2.2"
@ -832,15 +790,6 @@ dependencies = [
 "windows-sys 0.48.0",
 ]
 [[package]]
 name = "easy-cast"
 version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "10936778145f3bea71fd9bf61332cce28c28e96a380714f7ab34838b80733fd6"
 dependencies = [
 "libm",
 ]
 [[package]]
 name = "either"
 version = "1.12.0"
@ -944,12 +893,6 @@ dependencies = [
 "miniz_oxide",
 ]
 [[package]]
 name = "float-ord"
 version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8ce81f49ae8a0482e4c55ea62ebbd7e5a686af544c00b9d090bba3ff9be97b3d"
 [[package]]
 name = "float_eq"
 version = "1.0.1"
@ -1208,12 +1151,6 @@ version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 [[package]]
 name = "heck"
 version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
 [[package]]
 name = "heck"
 version = "0.5.0"
@ -1498,12 +1435,6 @@ dependencies = [
 "unicode-width",
 ]
 [[package]]
 name = "indoc"
 version = "2.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
 [[package]]
 name = "init-tracing-opentelemetry"
 version = "0.14.1"
@ -1674,12 +1605,6 @@ dependencies = [
 "once_cell",
 ]
 [[package]]
 name = "libm"
 version = "0.2.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
 [[package]]
 name = "libredox"
 version = "0.1.3"
@ -1896,7 +1821,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c"
 dependencies = [
 "libc",
 "log",
 "wasi",
 "windows-sys 0.48.0",
 ]
@ -2148,7 +2072,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
 "autocfg",
 "libm",
 ]
 [[package]]
@ -2412,17 +2335,6 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 [[package]]
 name = "papergrid"
 version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2ccbe15f2b6db62f9a9871642746427e297b0ceb85f9a7f1ee5ff47d184d0c8"
 dependencies = [
 "bytecount",
 "fnv",
 "unicode-width",
 ]
 [[package]]
 name = "parking_lot"
 version = "0.12.3"
@ -2626,7 +2538,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
 "bytes",
- "heck 0.5.0",
+ "heck",
 "itertools 0.12.1",
 "log",
 "multimap",
@ -2745,23 +2657,6 @@ dependencies = [
 "getrandom",
 ]
 [[package]]
 name = "ratatui"
 version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2e2e4cd95294a85c3b4446e63ef054eea43e0205b1fd60120c16b74ff7ff96ad"
 dependencies = [
 "bitflags 2.5.0",
 "cassowary",
 "crossterm",
 "indoc",
 "itertools 0.11.0",
 "paste",
 "strum",
 "unicode-segmentation",
 "unicode-width",
 ]
 [[package]]
 name = "rav1e"
 version = "0.7.1"
@ -3269,27 +3164,6 @@ dependencies = [
 "lazy_static",
 ]
 [[package]]
 name = "signal-hook"
 version = "0.3.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801"
 dependencies = [
 "libc",
 "signal-hook-registry",
 ]
 [[package]]
 name = "signal-hook-mio"
 version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
 dependencies = [
 "libc",
 "mio",
 "signal-hook",
 ]
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.2"
@ -3387,28 +3261,6 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 [[package]]
 name = "strum"
 version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125"
 dependencies = [
 "strum_macros",
 ]
 [[package]]
 name = "strum_macros"
 version = "0.25.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0"
 dependencies = [
 "heck 0.4.1",
 "proc-macro2",
 "quote",
 "rustversion",
 "syn 2.0.66",
 ]
 [[package]]
 name = "subtle"
 version = "2.5.0"
@ -3491,36 +3343,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e535eb8dded36d55ec13eddacd30dec501792ff23a0b1682c38601b8cf2349"
 dependencies = [
 "cfg-expr",
- "heck 0.5.0",
+ "heck",
 "pkg-config",
 "toml",
 "version-compare",
 ]
 [[package]]
 name = "tabled"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dfe9c3632da101aba5131ed63f9eed38665f8b3c68703a6bb18124835c1a5d22"
 dependencies = [
 "papergrid",
 "tabled_derive",
 "unicode-width",
 ]
 [[package]]
 name = "tabled_derive"
 version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
 dependencies = [
 "heck 0.4.1",
 "proc-macro-error",
 "proc-macro2",
 "quote",
 "syn 1.0.109",
 ]
 [[package]]
 name = "target-lexicon"
 version = "0.12.14"
@ -3539,45 +3367,6 @@ dependencies = [
 "windows-sys 0.52.0",
 ]
 [[package]]
 name = "text-generation-benchmark"
 version = "2.0.5-dev0"
 dependencies = [
 "average",
 "clap",
 "crossterm",
 "float-ord",
 "hf-hub",
 "ratatui",
 "serde",
 "serde_json",
 "tabled",
 "text-generation-client",
 "thiserror",
 "tokenizers",
 "tokio",
 "tracing",
 "tracing-subscriber",
 ]
 [[package]]
 name = "text-generation-client"
 version = "2.0.5-dev0"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
 "futures",
 "grpc-metadata",
 "prost 0.12.6",
 "prost-build",
 "thiserror",
 "tokio",
 "tonic 0.10.2",
 "tonic-build",
 "tower",
 "tracing",
 ]
 [[package]]
 name = "text-generation-launcher"
 version = "2.0.5-dev0"
@ -3627,7 +3416,6 @@ dependencies = [
 "reqwest",
 "serde",
 "serde_json",
 "text-generation-client",
 "thiserror",
 "tokenizers",
 "tokio",
@ -3641,6 +3429,54 @@ dependencies = [
 "vergen",
 ]
 [[package]]
 name = "text-generation-router-v3"
 version = "2.0.5-dev0"
 dependencies = [
 "async-stream",
 "async-trait",
 "axum 0.7.5",
 "axum-tracing-opentelemetry",
 "base64 0.22.1",
 "clap",
 "futures",
 "futures-util",
 "grpc-metadata",
 "hf-hub",
 "image",
 "init-tracing-opentelemetry",
 "jsonschema",
 "metrics",
 "metrics-exporter-prometheus",
 "minijinja",
 "minijinja-contrib",
 "nohash-hasher",
 "once_cell",
 "opentelemetry 0.20.0",
 "opentelemetry-otlp",
 "prost 0.12.6",
 "prost-build",
 "rand",
 "regex",
 "reqwest",
 "serde",
 "serde_json",
 "text-generation-router",
 "thiserror",
 "tokenizers",
 "tokio",
 "tokio-stream",
 "tonic 0.10.2",
 "tonic-build",
 "tower",
 "tower-http",
 "tracing",
 "tracing-opentelemetry 0.21.0",
 "tracing-subscriber",
 "utoipa",
 "utoipa-swagger-ui",
 ]
 [[package]]
 name = "thiserror"
 version = "1.0.61"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,9 +1,9 @@
 [workspace]
 members = [
-    "benchmark",
+#    "benchmark",
-    "router",
+    "backends/v3",
-    "router/client",
+#    "backends/client",
-    "router/grpc-metadata",
+    "backends/grpc-metadata",
    "launcher"
 ]
 resolver = "2"
--- a/backends/client/Cargo.toml
+++ b/backends/client/Cargo.toml
--- a/backends/client/src/v2/client.rs
+++ b/backends/client/src/v2/client.rs
--- a/backends/client/src/v2/mod.rs
+++ b/backends/client/src/v2/mod.rs
--- a/backends/client/src/v2/sharded_client.rs
+++ b/backends/client/src/v2/sharded_client.rs
--- a/backends/grpc-metadata/Cargo.toml
+++ b/backends/grpc-metadata/Cargo.toml
--- a/backends/grpc-metadata/src/lib.rs
+++ b/backends/grpc-metadata/src/lib.rs
--- a/backends/v3/Cargo.toml
+++ b/backends/v3/Cargo.toml
@ -0,0 +1,66 @@
 [package]
 name = "text-generation-router-v3"
 description = "Text Generation Webserver"
 version.workspace = true
 edition.workspace = true
 authors.workspace = true
 homepage.workspace = true
 [lib]
 path = "src/lib.rs"
 [[bin]]
 name = "text-generation-router"
 path = "src/main.rs"
 [dependencies]
 async-trait = "0.1.74"
 async-stream = "0.3.5"
 axum = { version = "0.7", features = ["json"] }
 axum-tracing-opentelemetry = "0.16"
 text-generation-router = { path = "../../router" }
 clap = { version = "4.4.5", features = ["derive", "env"] }
 grpc-metadata = { path = "../grpc-metadata" }
 futures = "0.3.28"
 hf-hub = { workspace = true }
 jsonschema = { version = "0.17.1", features = ["draft202012"] }
 metrics = "0.21.1"
 metrics-exporter-prometheus = { version = "0.12.1", features = [] }
 nohash-hasher = "0.2.0"
 opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
 opentelemetry-otlp = "0.13.0"
 rand = "0.8.5"
 reqwest = { version = "0.11.20", features = [] }
 serde = "1.0.188"
 serde_json = "1.0.107"
 thiserror = "1.0.48"
 tokenizers = { workspace = true}
 tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.14"
 tower-http = { version = "0.5.1", features = ["cors"] }
 tracing = "0.1.37"
 tracing-opentelemetry = "0.21.0"
 tracing-subscriber = { version = "0.3.17", features = ["json", "env-filter"] }
 utoipa = { version = "4.2.0", features = ["axum_extras"] }
 utoipa-swagger-ui = { version = "6.0.0", features = ["axum"] }
 init-tracing-opentelemetry = { version = "0.14.1", features = ["opentelemetry-otlp"] }
 minijinja = { version = "2.0.2" }
 minijinja-contrib = { version = "2.0.2", features = ["pycompat"] }
 futures-util = "0.3.30"
 regex = "1.10.3"
 once_cell = "1.19.0"
 image = "0.25.1"
 base64 = { workspace = true }
 prost = "^0.12"
 tonic = "^0.10"
 tower = "^0.4"
 [build-dependencies]
 tonic-build = "0.10.1"
 prost-build = "0.12.1"
 [features]
 default = ["ngrok"]
 ngrok = ["text-generation-router/ngrok"]
 google = ["text-generation-router/google"]
 kserve = ["text-generation-router/kserve"]
--- a/backends/v3/build.rs
+++ b/backends/v3/build.rs
@ -0,0 +1,19 @@
 use std::fs;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("cargo:rerun-if-changed=../../proto/");
    fs::create_dir_all("src/client/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");
    tonic_build::configure()
        .build_client(true)
        .build_server(false)
        .out_dir("src/client/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
    Ok(())
 }
--- a/router/src/infer/v3/backend.rs
+++ b/router/src/infer/v3/backend.rs
@ -1,16 +1,15 @@
 /// Batching and inference logic
-use crate::infer::v3::queue::{Entry, Queue};
+use crate::queue::{Entry, Queue};
-use crate::infer::{
+use text_generation_router::infer::{
    GeneratedText, InferError, InferStreamResponse, Backend,
 };
-use crate::validation::ValidGenerateRequest;
+use text_generation_router::validation::ValidGenerateRequest;
-use crate::{FinishReason, PrefillToken, Token};
+use text_generation_router::{FinishReason, PrefillToken, Token};
 use nohash_hasher::IntMap;
 use std::sync::{
    Arc,
 };
-use text_generation_client::v3::{Batch, CachedBatch, Generation, ShardedClient};
+use crate::client::{Batch, CachedBatch, Generation, ShardedClient, ClientError, Health};
 use text_generation_client::{ClientError, Health};
 use tokio::sync::mpsc::error::SendError;
 use tokio::sync::{mpsc, Notify};
 use tokio::time::Instant;
@ -18,7 +17,7 @@ use tokio_stream::wrappers::UnboundedReceiverStream;
 use tracing::{info_span, instrument, Instrument, Span};
 use async_trait::async_trait;
-pub(crate) struct BackendV3 {
+pub struct BackendV3 {
    /// Request queue
    queue: Queue,
    /// Notify batcher on queue appends
@ -78,7 +77,6 @@ impl Backend for BackendV3 {
    ) -> Result<UnboundedReceiverStream<Result<InferStreamResponse, InferError>>, InferError> {
        // MPSC channel to communicate with the background batching task
        let (response_tx, response_rx) = mpsc::unbounded_channel();
        let input_length = request.input_length;
        // Append the request to the queue
        self.queue.append(Entry {
@ -480,14 +478,14 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
    });
 }
-impl From<text_generation_client::v3::GeneratedText> for GeneratedText {
+impl From<crate::client::GeneratedText> for GeneratedText {
-    fn from(value: text_generation_client::v3::GeneratedText) -> Self {
+    fn from(value: crate::client::GeneratedText) -> Self {
        let v3_finish_reason =
-            text_generation_client::v3::FinishReason::try_from(value.finish_reason).unwrap();
+            crate::client::FinishReason::try_from(value.finish_reason).unwrap();
        let finish_reason = match v3_finish_reason {
-            text_generation_client::v3::FinishReason::Length => FinishReason::Length,
+            crate::client::FinishReason::Length => FinishReason::Length,
-            text_generation_client::v3::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
+            crate::client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
-            text_generation_client::v3::FinishReason::StopSequence => FinishReason::StopSequence,
+            crate::client::FinishReason::StopSequence => FinishReason::StopSequence,
        };
        Self {
--- a/router/src/infer/v3/block_allocator.rs
+++ b/router/src/infer/v3/block_allocator.rs
--- a/backends/v3/src/client/client.rs
+++ b/backends/v3/src/client/client.rs
@ -1,6 +1,7 @@
 use crate::v3::{pb, Chunk};
 use crate::{ClientError, Result, WARMUP_IMAGE_BASE64};
 /// Single shard Client
 use crate::client::{pb, Chunk};
 use crate::client::{ClientError, Result, WARMUP_IMAGE_BASE64};
 use base64::engine::general_purpose::STANDARD;
 use base64::Engine;
 use grpc_metadata::InjectTelemetryContext;
--- a/backends/v3/src/client/mod.rs
+++ b/backends/v3/src/client/mod.rs
@ -1,15 +1,24 @@
 //! Text Generation gRPC client library
 pub mod v2;
 pub mod v3;
 use async_trait::async_trait;
 use base64::{engine::general_purpose::STANDARD, Engine};
 use thiserror::Error;
 use tonic::transport;
 use tonic::Status;
-pub use v3::{Chunk, Image, Input, InputChunk};
+#[allow(clippy::derive_partial_eq_without_eq)]
 mod pb;
 mod client;
 mod sharded_client;
 pub use client::Client;
 pub use pb::generate::v3::{
    input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
    StoppingCriteriaParameters,
 };
 pub use sharded_client::ShardedClient;
 #[async_trait]
 pub trait Health {
@ -63,29 +72,6 @@ impl From<Chunk> for InputChunk {
    }
 }
 /// Convert input chunks to a stringly-typed input for backwards
 /// compat for backends that haven't implemented chunked inputs.
 pub trait ChunksToString {
    /// Convert chunks to string.
    fn chunks_to_string(&self) -> String;
 }
 impl ChunksToString for Vec<InputChunk> {
    fn chunks_to_string(&self) -> String {
        let mut output = String::new();
        self.iter().for_each(|c| match &c.chunk {
            Some(Chunk::Text(text)) => output.push_str(text),
            Some(Chunk::Image(Image { data, mimetype })) => {
                let encoded = STANDARD.encode(data);
                output.push_str(&format!("![](data:{};base64,{})", mimetype, encoded))
            }
            // We don't create empty chunks, so this should be unreachable.
            None => unreachable!("Chunks should never be empty"),
        });
        output
    }
 }
 static WARMUP_IMAGE_BASE64 :&str = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII=";
 pub type Result<T> = std::result::Result<T, ClientError>;
--- a/backends/v3/src/client/sharded_client.rs
+++ b/backends/v3/src/client/sharded_client.rs
@ -1,14 +1,14 @@
 /// Multi shard Client
-use crate::{v3, Health, ShardInfo};
+use crate::client::{Health, ShardInfo};
-use crate::{ClientError, Result};
+use crate::client::{ClientError, Result};
-use crate::v3::{Chunk, InfoResponse, Input};
+use crate::client::{Chunk, InfoResponse, Input};
 use async_trait::async_trait;
 use futures::future::join_all;
 use tonic::transport::Uri;
 use tracing::instrument;
-use v3::client::{DecodeTimings, PrefillTimings};
+use crate::client::client::{DecodeTimings, PrefillTimings};
-use v3::{
+use crate::client::{
    Batch, CachedBatch, Client, Generation, GrammarType, HealthResponse,
    NextTokenChooserParameters, Request, StoppingCriteriaParameters,
 };
--- a/router/src/infer/v3/mod.rs
+++ b/router/src/infer/v3/mod.rs
@ -1,14 +1,13 @@
 mod block_allocator;
 mod queue;
 mod backend;
 mod client;
 use futures_util::TryFutureExt;
 use serde::Serialize;
 use thiserror::Error;
 use utoipa::ToSchema;
 pub(crate) use backend::BackendV3;
-use text_generation_client::ClientError;
+use crate::client::{ShardedClient, ClientError};
 use text_generation_client::v3::ShardedClient;
 #[derive(Clone, Debug, Serialize, ToSchema)]
 pub struct BackendInfo {
@ -127,7 +126,7 @@ pub async fn connect_backend(
 }
 #[derive(Debug, Error)]
-pub(crate) enum V3Error {
+pub enum V3Error {
    #[error("Unable to clear the Python model shards cache: {0}")]
    Cache(ClientError),
    #[error("Unable to connect to the Python model shards: {0}")]
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@ -20,6 +20,7 @@ use tower_http::cors::AllowOrigin;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer};
 use text_generation_router_v3::{connect_backend, V3Error};
 /// App Configuration
 #[derive(Parser, Debug)]
@ -336,9 +337,11 @@ async fn main() -> Result<(), RouterError> {
        }
    };
    let (backend, backend_info) = connect_backend(max_input_tokens, max_total_tokens, master_shard_uds_path, waiting_served_ratio, max_batch_prefill_tokens, max_batch_total_tokens, max_waiting_tokens, max_batch_size).await?;
    // Run server
    server::run(
-        master_shard_uds_path,
+        backend,
        model_info,
        compat_return_full_text,
        max_concurrent_requests,
@ -347,11 +350,6 @@ async fn main() -> Result<(), RouterError> {
        max_top_n_tokens,
        max_input_tokens,
        max_total_tokens,
        waiting_served_ratio,
        max_batch_prefill_tokens,
        max_batch_total_tokens,
        max_waiting_tokens,
        max_batch_size,
        tokenizer,
        config,
        validation_workers,
@ -508,6 +506,8 @@ pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option<HubTokenizerConf
 enum RouterError {
    #[error("Argument validation error: {0}")]
    ArgumentValidation(String),
    #[error("Backend failed: {0}")]
    Backend(#[from] V3Error),
    #[error("WebServer error: {0}")]
    WebServer(#[from] server::WebServerError),
    #[error("Tokio runtime failed to start: {0}")]
--- a/router/src/infer/v3/queue.rs
+++ b/router/src/infer/v3/queue.rs
@ -1,17 +1,14 @@
-use crate::infer::v3::block_allocator::{BlockAllocation, BlockAllocator};
+use crate::block_allocator::{BlockAllocation, BlockAllocator};
-use crate::infer::InferError;
+use text_generation_router::infer::InferError;
-use crate::infer::InferStreamResponse;
+use text_generation_router::infer::InferStreamResponse;
-use crate::validation::{
+use text_generation_router::validation::{Chunk, ChunksToString, ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters};
    ValidGenerateRequest, ValidGrammar, ValidParameters, ValidStoppingParameters,
 };
 use nohash_hasher::{BuildNoHashHasher, IntMap};
 use std::cmp::{max, min};
 use std::collections::VecDeque;
-use text_generation_client::v3::{
+use crate::client::{
    Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters,
 };
-use text_generation_client::ChunksToString;
+use crate::client as client;
 use text_generation_client::Input;
 use tokio::sync::{mpsc, oneshot};
 use tokio::time::Instant;
 use tracing::{info_span, instrument, Instrument, Span};
@ -337,8 +334,11 @@ impl State {
            batch_requests.push(Request {
                id,
                prefill_logprobs: entry.request.decoder_input_details,
-                input_chunks: Some(Input {
+                input_chunks: Some(client::Input {
-                    chunks: entry.request.inputs.clone(),
+                    chunks: entry.request.inputs.clone().into_iter().map(|c| client::InputChunk { chunk: Some(match c {
                        Chunk::Text(text) => client::Chunk::Text(text),
                        Chunk::Image(image) => client::Chunk::Image(client::Image { data: image.data, mimetype: image.mimetype })
                    })}).collect()
                }),
                inputs: entry.request.inputs.chunks_to_string(),
                truncate: entry.request.truncate,
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@ -21,7 +21,7 @@ float-ord = "0.3.2"
 serde = {version = "1.0.188", features = ["derive"]}
 serde_json = "1.0"
 tabled = "0.14.0"
-text-generation-client = { path = "../router/client" }
+text-generation-client = { path = "../backends/client" }
 thiserror = "1.0.48"
 tokenizers = { workspace = true }
 tokio = { version = "1.32.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "macros"] }
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@ -7,19 +7,11 @@ edition.workspace = true
 authors.workspace = true
 homepage.workspace = true
 [lib]
 path = "src/lib.rs"
 [[bin]]
 name = "text-generation-router"
 path = "src/main.rs"
 [dependencies]
 async-trait = "0.1.74"
 async-stream = "0.3.5"
 axum = { version = "0.7", features = ["json"] }
 axum-tracing-opentelemetry = "0.16"
 text-generation-client = { path = "client" }
 clap = { version = "4.4.5", features = ["derive", "env"] }
 futures = "0.3.28"
 hf-hub = { workspace = true }
--- a/router/client/build.rs
+++ b/router/client/build.rs
@ -1,35 +0,0 @@
 use std::fs;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!("cargo:rerun-if-changed=../../proto/");
    fs::create_dir_all("src/v2/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");
    tonic_build::configure()
        .build_client(true)
        .build_server(false)
        .out_dir("src/v2/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/generate.proto"], &["../../proto"])
        .map_err(|e| match e.kind(){
            std::io::ErrorKind::NotFound => {panic!("`protoc` not found, install libprotoc")},
            std::io::ErrorKind::Other => {panic!("`protoc` version unsupported, upgrade protoc: https://github.com/protocolbuffers/protobuf/releases")},
            e => {e}
        }).unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
    fs::create_dir_all("src/v3/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
    config.protoc_arg("--experimental_allow_proto3_optional");
    tonic_build::configure()
        .build_client(true)
        .build_server(false)
        .out_dir("src/v3/pb")
        .include_file("mod.rs")
        .compile_with_config(config, &["../../proto/v3/generate.proto"], &["../../proto"])
        .unwrap_or_else(|e| panic!("protobuf compilation failed: {e}"));
    Ok(())
 }
--- a/router/client/src/v2/pb/.gitignore
+++ b/router/client/src/v2/pb/.gitignore
@ -1 +0,0 @@
 *
--- a/router/client/src/v3/mod.rs
+++ b/router/client/src/v3/mod.rs
@ -1,13 +0,0 @@
 #[allow(clippy::derive_partial_eq_without_eq)]
 mod pb;
 mod client;
 mod sharded_client;
 pub use client::Client;
 pub use pb::generate::v3::{
    input_chunk::Chunk, Batch, CachedBatch, FinishReason, GeneratedText, Generation, GrammarType,
    HealthResponse, Image, InfoResponse, Input, InputChunk, NextTokenChooserParameters, Request,
    StoppingCriteriaParameters, Tokens,
 };
 pub use sharded_client::ShardedClient;
--- a/router/client/src/v3/pb/.gitignore
+++ b/router/client/src/v3/pb/.gitignore
@ -1 +0,0 @@
 *
--- a/router/src/infer/mod.rs
+++ b/router/src/infer/mod.rs
@ -1,5 +1,4 @@
 // pub(crate) mod v2;
 pub(crate) mod v3;
 mod chat_template;
 pub mod tool_grammar;
@ -23,7 +22,7 @@ use chat_template::ChatTemplate;
 use async_trait::async_trait;
 #[async_trait]
-pub(crate) trait Backend {
+pub trait Backend {
    fn schedule(
        &self,
        request: ValidGenerateRequest,
@ -286,15 +285,15 @@ pub(crate) type GenerateStreamResponse = (
 );
 #[derive(Debug)]
-pub(crate) struct GeneratedText {
+pub struct GeneratedText {
-    pub(crate) text: String,
+    pub text: String,
-    pub(crate) generated_tokens: u32,
+    pub generated_tokens: u32,
-    pub(crate) finish_reason: FinishReason,
+    pub finish_reason: FinishReason,
-    pub(crate) seed: Option<u64>,
+    pub seed: Option<u64>,
 }
 #[derive(Debug)]
-pub(crate) enum InferStreamResponse {
+pub enum InferStreamResponse {
    // Optional first message
    Prefill(Vec<PrefillToken>),
    // Intermediate messages
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -1,8 +1,8 @@
 /// Text Generation Inference Webserver
 pub mod config;
-mod infer;
+pub mod infer;
 pub mod server;
-mod validation;
+pub mod validation;
 #[cfg(feature = "kserve")]
 mod kserve;
@ -1055,23 +1055,23 @@ impl From<CompatGenerateRequest> for GenerateRequest {
 #[derive(Debug, Serialize, ToSchema)]
 pub struct PrefillToken {
    #[schema(example = 0)]
-    id: u32,
+    pub id: u32,
    #[schema(example = "test")]
-    text: String,
+    pub text: String,
    #[schema(nullable = true, example = - 0.34)]
-    logprob: f32,
+    pub logprob: f32,
 }
 #[derive(Debug, Serialize, ToSchema, Clone)]
 pub struct Token {
    #[schema(example = 0)]
-    id: u32,
+    pub id: u32,
    #[schema(example = "test")]
-    text: String,
+    pub text: String,
    #[schema(nullable = true, example = - 0.34)]
-    logprob: f32,
+    pub logprob: f32,
    #[schema(example = "false")]
-    special: bool,
+    pub special: bool,
 }
 #[derive(Debug, Serialize, ToSchema)]
@ -1089,7 +1089,7 @@ pub struct SimpleToken {
 #[derive(Debug, Serialize, ToSchema)]
 #[serde(rename_all(serialize = "snake_case"))]
 #[schema(example = "Length")]
-pub(crate) enum FinishReason {
+pub enum FinishReason {
    #[schema(rename = "length")]
    Length,
    #[serde(rename = "eos_token")]
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -1,6 +1,5 @@
 /// HTTP Server logic
 use crate::config::Config;
 use crate::infer::v3::{connect_backend, V3Error};
 use crate::infer::{Infer, InferError, InferResponse, InferStreamResponse, Backend};
 use crate::infer::tool_grammar::ToolGrammar;
 #[cfg(feature = "kserve")]
@ -38,9 +37,6 @@ use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
 use serde_json::Value;
 use std::convert::Infallible;
 use std::net::SocketAddr;
 use std::sync::atomic::AtomicBool;
 use std::sync::Arc;
 use text_generation_client::{v2, v3, ClientError, ShardInfo};
 use thiserror::Error;
 use tokenizers::Tokenizer;
 use tokio::select;
@ -1398,7 +1394,7 @@ pub(crate) struct ComputeType(String);
 /// Serving method
 #[allow(clippy::too_many_arguments)]
 pub async fn run(
-    master_shard_uds_path: String,
+    backend: impl Backend + Send + Sync + 'static,
    model_info: HubModelInfo,
    compat_return_full_text: bool,
    max_concurrent_requests: usize,
@ -1407,11 +1403,6 @@ pub async fn run(
    max_top_n_tokens: u32,
    max_input_tokens: usize,
    max_total_tokens: usize,
    waiting_served_ratio: f32,
    max_batch_prefill_tokens: u32,
    max_batch_total_tokens: Option<u32>,
    max_waiting_tokens: usize,
    max_batch_size: Option<usize>,
    tokenizer: Option<Tokenizer>,
    config: Option<Config>,
    validation_workers: usize,
@ -1495,11 +1486,6 @@ pub async fn run(
    struct ApiDoc;
    // Create state
    // Open connection, get model info and warmup
    let (backend, backend_info) = connect_backend(max_input_tokens, max_total_tokens, master_shard_uds_path, waiting_served_ratio, max_batch_prefill_tokens, max_batch_total_tokens, max_waiting_tokens, max_batch_size).await?;
    // tracing::info!("Setting max batch total tokens to {max_batch_total_tokens}");
    let validation = Validation::new(
        validation_workers,
        tokenizer,
@ -1827,8 +1813,6 @@ impl From<InferError> for Event {
 #[derive(Debug, Error)]
 pub enum WebServerError {
    #[error("Backend error: {0}")]
    Backend(#[from] V3Error),
    #[error("Axum error: {0}")]
    Axum(#[from] axum::BoxError),
 }
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -8,7 +8,6 @@ use jsonschema::{Draft, JSONSchema};
 use rand::{thread_rng, Rng};
 use serde_json::Value;
 use std::io::Cursor;
 use text_generation_client::{Chunk, Image, InputChunk};
 use thiserror::Error;
 use tokenizers::tokenizer::Tokenizer;
 use tokio::sync::mpsc;
@ -86,7 +85,7 @@ impl Validation {
        &self,
        inputs: String,
        truncate: Option<usize>,
-    ) -> Result<Option<(tokenizers::Encoding, Vec<InputChunk>)>, ValidationError> {
+    ) -> Result<Option<(tokenizers::Encoding, Vec<Chunk>)>, ValidationError> {
        // If we have a fast tokenizer
        if let Some(sender) = &self.sender {
            // Create response channel
@ -112,7 +111,7 @@ impl Validation {
        inputs: String,
        truncate: Option<usize>,
        max_new_tokens: Option<u32>,
-    ) -> Result<(Vec<InputChunk>, usize, u32), ValidationError> {
+    ) -> Result<(Vec<Chunk>, usize, u32), ValidationError> {
        // If we have a fast tokenizer
        if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? {
            // Create response channel
@ -514,7 +513,7 @@ fn prepare_input(
    _truncate: Option<usize>,
    tokenizer: &Tokenizer,
    config: &Option<Config>,
-) -> Result<(tokenizers::Encoding, Vec<InputChunk>), ValidationError> {
+) -> Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError> {
    static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[\]\([^\)]*\)").unwrap());
    let (tokenizer_query, input_chunks) = match config {
        Some(Config::LlavaNext(config)) => {
@ -626,18 +625,51 @@ fn prepare_input(
 type TokenizerRequest = (
    (String, Option<usize>),
-    oneshot::Sender<Result<(tokenizers::Encoding, Vec<InputChunk>), ValidationError>>,
+    oneshot::Sender<Result<(tokenizers::Encoding, Vec<Chunk>), ValidationError>>,
    Span,
 );
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub struct Image {
    pub data: Vec<u8>,
    pub mimetype: String,
 }
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub enum Chunk {
    Text(String),
    Image(Image)
 }
 /// Convert input chunks to a stringly-typed input for backwards
 /// compat for backends that haven't implemented chunked inputs.
 pub trait ChunksToString {
    /// Convert chunks to string.
    fn chunks_to_string(&self) -> String;
 }
 impl ChunksToString for Vec<Chunk> {
    fn chunks_to_string(&self) -> String {
        let mut output = String::new();
        self.iter().for_each(|c| match &c {
            Chunk::Text(text) => output.push_str(text),
            Chunk::Image(Image { data, mimetype }) => {
                let encoded = STANDARD.encode(data);
                output.push_str(&format!("![](data:{};base64,{})", mimetype, encoded))
            }
        });
        output
    }
 }
 #[derive(Debug, Clone)]
-pub(crate) enum ValidGrammar {
+pub enum ValidGrammar {
    Json(String),
    Regex(String),
 }
 #[derive(Debug, Clone)]
-pub(crate) struct ValidParameters {
+pub struct ValidParameters {
    /// / exponential scaling output probability distribution
    pub temperature: f32,
    /// / restricting to the k highest probability elements
@ -661,7 +693,7 @@ pub(crate) struct ValidParameters {
 }
 #[derive(Debug, Clone)]
-pub(crate) struct ValidStoppingParameters {
+pub struct ValidStoppingParameters {
    /// / Maximum number of generated tokens
    pub max_new_tokens: u32,
    /// / Optional stopping sequences
@ -672,8 +704,8 @@ pub(crate) struct ValidStoppingParameters {
 }
 #[derive(Debug, Clone)]
-pub(crate) struct ValidGenerateRequest {
+pub struct ValidGenerateRequest {
-    pub inputs: Vec<InputChunk>,
+    pub inputs: Vec<Chunk>,
    pub input_length: u32,
    pub truncate: u32,
    pub decoder_input_details: bool,