Trying to fix non-chunking targets.

2024-10-23 15:02:52 +08:00 · 2024-10-23 15:02:52 +08:00 · 0a01dde986
parent a31db04709
commit 0a01dde986
1 changed files with 25 additions and 15 deletions
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1398,6 +1398,7 @@ class FlashCausalLM(Model):
        total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size

        if max_total_tokens is None:
+            if get_support_chunking():
                model_max_length = self.tokenizer.model_max_length
                free_memory = get_free_memory(self.device, MEMORY_FRACTION)
                spare_blocks = (
@@ -1411,9 +1412,18 @@ class FlashCausalLM(Model):
                batch.num_blocks = available_blocks
                batch.max_blocks = available_blocks
                max_input_tokens = (
-                available_blocks - 1 if max_input_tokens is None else max_input_tokens
+                    available_blocks - 1
+                    if max_input_tokens is None
+                    else max_input_tokens
                )
                max_total_tokens = available_blocks
+            else:
+                max_total_tokens = batch.num_blocks
+                max_input_tokens = (
+                    batch.num_blocks - 1
+                    if max_input_tokens is None
+                    else max_input_tokens
+                )

        try:
            self.init_kv_cache(