hotfix: avoid non-prefilled block use when using prefix caching (#2489)

The minimum batch size logic could cause prefix blocks to be
deallocated before they had been prefilled. The next allocation
of the same prefix would then use blocks containing garbage.
This commit is contained in:
Daniël de Kok 2024-09-05 15:09:29 +02:00 committed by GitHub
parent 6cb42f49ae
commit deec30f893
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 5 additions and 2 deletions

View File

@ -122,7 +122,7 @@ impl Backend for BackendV3 {
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub(crate) async fn batching_task( pub(crate) async fn batching_task(
mut client: ShardedClient, mut client: ShardedClient,
waiting_served_ratio: f32, _waiting_served_ratio: f32,
max_batch_prefill_tokens: u32, max_batch_prefill_tokens: u32,
max_batch_total_tokens: u32, max_batch_total_tokens: u32,
max_waiting_tokens: usize, max_waiting_tokens: usize,
@ -168,7 +168,10 @@ pub(crate) async fn batching_task(
None None
} else { } else {
// Minimum batch size // Minimum batch size
Some((batch_size as f32 * waiting_served_ratio).floor() as usize) // TODO: temporarily disable to avoid incorrect deallocation +
// reallocation when using prefix caching.
// Some((batch_size as f32 * waiting_served_ratio).floor() as usize)
None
}; };
let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);