From 6d06473cf48e19e7382b27940f993a5f48c83997 Mon Sep 17 00:00:00 2001 From: drbh Date: Fri, 9 Aug 2024 04:54:32 -0400 Subject: [PATCH] Pr 2352 ci branch (#2382) * Fix unsigned integer underflow Passing --max-batch-size to the launcher actually had no effect because after a few requests the max_size passed to State::next_batch would underflow becoming a large positive number. In the scheduler, as soon as the cached batch size reached the max_batch_size the max_size passed to next_batch becomes 0. Since the only check in that function is ``` if Some(batch_requests.len()) == max_size { break; } ``` and it's called after the `batch_requests.len()` has become 1, it doesn't do anything to prevent more than 0 requests from being batched. Now we have a cached batch in the server that is larger than max_batch_size and `max_size - batch_size as usize` underflows. Signed-off-by: Max de Bayser * fix: update v3 scheduler and ensure max_batch_size > 0 --------- Signed-off-by: Max de Bayser Co-authored-by: Max de Bayser --- backends/v3/src/backend.rs | 3 ++- backends/v3/src/main.rs | 8 ++++++++ backends/v3/src/queue.rs | 7 +++++++ router/src/infer/v2/queue.rs | 7 +++++++ router/src/infer/v2/scheduler.rs | 4 ++-- 5 files changed, 26 insertions(+), 3 deletions(-) diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index d82355de..6b3e0526 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -168,7 +168,8 @@ pub(crate) async fn batching_task( }; let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); - let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize); + let max_size = + max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize)); // Try to get a new batch if let Some((mut new_entries, new_batch, span)) = queue diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs index 21952e66..471ddb5a 100644 --- a/backends/v3/src/main.rs +++ b/backends/v3/src/main.rs @@ -150,6 +150,14 @@ 
async fn main() -> Result<(), RouterError> { } } + if let Some(max_batch_size) = max_batch_size { + if max_batch_size == 0 { + return Err(RouterError::ArgumentValidation( + "`max_batch_size` must be > 0".to_string(), + )); + } + } + let (backend, _backend_info) = connect_backend( max_input_tokens, max_total_tokens, diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index 9427bd60..b457389c 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -226,6 +226,13 @@ impl State { } } + if let Some(max_size) = max_size { + if max_size == 0 { + tracing::debug!("No capacity"); + return None; + } + } + // Pad prefill_token_budget to be a multiple of block size let prefill_token_budget = ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size; diff --git a/router/src/infer/v2/queue.rs b/router/src/infer/v2/queue.rs index 0b51645a..696cbfc8 100644 --- a/router/src/infer/v2/queue.rs +++ b/router/src/infer/v2/queue.rs @@ -205,6 +205,13 @@ impl State { } } + if let Some(max_size) = max_size { + if max_size == 0 { + tracing::debug!("No capacity"); + return None; + } + } + // Pad prefill_token_budget to be a multiple of block size let prefill_token_budget = ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size; diff --git a/router/src/infer/v2/scheduler.rs b/router/src/infer/v2/scheduler.rs index 3d6c36cf..cc333674 100644 --- a/router/src/infer/v2/scheduler.rs +++ b/router/src/infer/v2/scheduler.rs @@ -161,8 +161,8 @@ pub(crate) async fn batching_task( }; let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); - let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize); - + let max_size = + max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize)); // Try to get a new batch if let Some((mut new_entries, new_batch, span)) = queue .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)