From 6d06473cf48e19e7382b27940f993a5f48c83997 Mon Sep 17 00:00:00 2001
From: drbh <david.richard.holtz@gmail.com>
Date: Fri, 9 Aug 2024 04:54:32 -0400
Subject: [PATCH] Pr 2352 ci branch (#2382)

* Fix unsigned integer underflow

Passing --max-batch-size to the launcher actually had no effect
because after a few requests the max_size passed to State::next_batch
would underflow becoming a largo positive number.

In the scheduler, as soon as the cached batch size reached the
max_batch_size the max_size passed to next_batch becomes 0.
Since the only check in that funcion is
```
if Some(batch_requests.len()) == max_size {
    break;
}
```
and it's called after the `batch_requests.len()` has
become 1, it doesn't do anything to prevent more than 0
requests from being batched.

Now we have cached batch in the server that is large than
max_batch_size and `max_size - batch_size as usize`
underflows.

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>

* fix: update v3 scheduler and ensure max_batch_size > 0

---------

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
---
 backends/v3/src/backend.rs       | 3 ++-
 backends/v3/src/main.rs          | 8 ++++++++
 backends/v3/src/queue.rs         | 7 +++++++
 router/src/infer/v2/queue.rs     | 7 +++++++
 router/src/infer/v2/scheduler.rs | 4 ++--
 5 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs
index d82355de..6b3e0526 100644
--- a/backends/v3/src/backend.rs
+++ b/backends/v3/src/backend.rs
@@ -168,7 +168,8 @@ pub(crate) async fn batching_task(
                 };
 
                 let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
-                let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize);
+                let max_size =
+                    max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize));
 
                 // Try to get a new batch
                 if let Some((mut new_entries, new_batch, span)) = queue
diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs
index 21952e66..471ddb5a 100644
--- a/backends/v3/src/main.rs
+++ b/backends/v3/src/main.rs
@@ -150,6 +150,14 @@ async fn main() -> Result<(), RouterError> {
         }
     }
 
+    if let Some(max_batch_size) = max_batch_size {
+        if max_batch_size == 0 {
+            return Err(RouterError::ArgumentValidation(
+                "`max_batch_size` must be > 0".to_string(),
+            ));
+        }
+    }
+
     let (backend, _backend_info) = connect_backend(
         max_input_tokens,
         max_total_tokens,
diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs
index 9427bd60..b457389c 100644
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@@ -226,6 +226,13 @@ impl State {
             }
         }
 
+        if let Some(max_size) = max_size {
+            if max_size == 0 {
+                tracing::debug!("No capacity");
+                return None;
+            }
+        }
+
         // Pad prefill_token_budget to be a multiple of block size
         let prefill_token_budget =
             ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size;
diff --git a/router/src/infer/v2/queue.rs b/router/src/infer/v2/queue.rs
index 0b51645a..696cbfc8 100644
--- a/router/src/infer/v2/queue.rs
+++ b/router/src/infer/v2/queue.rs
@@ -205,6 +205,13 @@ impl State {
             }
         }
 
+        if let Some(max_size) = max_size {
+            if max_size == 0 {
+                tracing::debug!("No capacity");
+                return None;
+            }
+        }
+
         // Pad prefill_token_budget to be a multiple of block size
         let prefill_token_budget =
             ((prefill_token_budget + self.block_size - 1) / self.block_size) * self.block_size;
diff --git a/router/src/infer/v2/scheduler.rs b/router/src/infer/v2/scheduler.rs
index 3d6c36cf..cc333674 100644
--- a/router/src/infer/v2/scheduler.rs
+++ b/router/src/infer/v2/scheduler.rs
@@ -161,8 +161,8 @@ pub(crate) async fn batching_task(
                 };
 
                 let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens);
-                let max_size = max_batch_size.map(|max_size| max_size - batch_size as usize);
-
+                let max_size =
+                    max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize));
                 // Try to get a new batch
                 if let Some((mut new_entries, new_batch, span)) = queue
                     .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget)