Allow fixing the paged attention num blocks via the NUM_BLOCKS env var
parent 9a59ebcec3
commit bb37321b9f
@@ -808,12 +808,17 @@ class FlashCausalLM(Model):
         free_memory = get_free_memory(self.device, MEMORY_FRACTION)

-        num_blocks = (
-            # Leave 5% for some wiggle room
-            int((free_memory * 0.95) // total_cache_size)
-            # Add batch.blocks as we allocated it above, so it is included in the peak memory.
-            + cache_manager.num_blocks
-        )
+        if os.environ.get("NUM_BLOCKS") is None:
+            num_blocks = (
+                # Leave 5% for some wiggle room
+                int((free_memory * 0.95) // total_cache_size)
+                # Add batch.blocks as we allocated it above, so it is included in the peak memory.
+                + cache_manager.num_blocks
+            )
+        else:
+            num_blocks = int(os.environ["NUM_BLOCKS"])
+
+        logger.debug(f"Paged attention num_blocks: {num_blocks}")

         del batch
         del cache_manager
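The change makes the block count overridable: when NUM_BLOCKS is unset, the server keeps deriving it from free device memory with 5% headroom; when set, the value is used verbatim. A minimal sketch of that selection logic, where resolve_num_blocks and its parameters are hypothetical names standing in for the method-local state in FlashCausalLM's warmup:

    import os

    def resolve_num_blocks(free_memory: int, total_cache_size: int, allocated_blocks: int) -> int:
        # Hypothetical standalone version of the patched logic: NUM_BLOCKS
        # wins when set, otherwise size the cache from free memory.
        if os.environ.get("NUM_BLOCKS") is None:
            # Leave 5% for some wiggle room, then add back the blocks already
            # allocated for the warmup batch so they count toward the peak.
            return int((free_memory * 0.95) // total_cache_size) + allocated_blocks
        return int(os.environ["NUM_BLOCKS"])

    # Example: a fixed override takes precedence over the memory-derived count.
    os.environ["NUM_BLOCKS"] = "2048"
    assert resolve_num_blocks(8 << 30, 1 << 20, 16) == 2048

With this patch applied, setting NUM_BLOCKS in the server's environment should pin the paged attention cache to that many blocks regardless of the measured free memory.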