diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index 05a26370..a47e62dc 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -122,7 +122,7 @@ impl Backend for BackendV3 { #[allow(clippy::too_many_arguments)] pub(crate) async fn batching_task( mut client: ShardedClient, - waiting_served_ratio: f32, + _waiting_served_ratio: f32, max_batch_prefill_tokens: u32, max_batch_total_tokens: u32, max_waiting_tokens: usize, @@ -168,7 +168,10 @@ pub(crate) async fn batching_task( None } else { // Minimum batch size - Some((batch_size as f32 * waiting_served_ratio).floor() as usize) + // TODO: temporarily disable to avoid incorrect deallocation + + // reallocation when using prefix caching. + // Some((batch_size as f32 * waiting_served_ratio).floor() as usize) + None }; let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);