diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 2cdc49a0..e4b060ca 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -1386,7 +1386,8 @@ class FlashCausalLM(Model): total_cuda_graph_memory = free_memory_post_alloc - last_available_memory log_master( logger.info, - f"Total memory used for CUDA graphs: {total_cuda_graph_memory/1024/1024:.2f} MB", + f"Total memory used for CUDA graphs: {total_cuda_graph_memory/1024/1024:.2f} MB" + f"\nTotal memory available: {last_available_memory/1024/1024:.2f} MB", ) except torch.cuda.OutOfMemoryError: logger.exception("Decode cuda graph warmup failed")