feat: add basic test for the warmup step and memory allocation of the kv cache

2024-10-09 13:49:11 +00:00 · 2024-10-09 13:49:11 +00:00 · a8108bc0da
parent 9ed0c85fe1
commit a8108bc0da
2 changed files with 200 additions and 0 deletions
--- a/server/tests/models/test_flash_causal_lm.py
+++ b/server/tests/models/test_flash_causal_lm.py
@ -0,0 +1,156 @@
+import pytest
+import torch
+
+from transformers import AutoTokenizer
+
+from text_generation_server.pb import generate_pb2
+from text_generation_server.models.flash_causal_lm import (
+    FlashCausalLMBatch,
+    FlashCausalLM,
+)
+from text_generation_server.models.custom_modeling.flash_llama_modeling import (
+    FlashLlamaForCausalLM,
+)
+from text_generation_server.models.globals import set_adapter_to_index
+from text_generation_server.utils.import_utils import SYSTEM, empty_cache, synchronize
+from unittest.mock import Mock
+import base64
+
+model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+
+set_adapter_to_index({})
+
+
+def test_flash_causal_lm_warmup():
+    if SYSTEM == "cuda":
+        flash_causal_lm_warmup()
+    else:
+        pytest.skip("Test only runs on CUDA")
+
+
+def flash_causal_lm_warmup():
+    revision = "main"
+    quantize = None
+    speculator = False
+    dtype = torch.float16
+    kv_cache_dtype = torch.float16
+    trust_remote_code = False
+    lora_adapter_ids = None
+    device = torch.device("cuda:0")
+
+    current_memory = torch.cuda.memory_allocated(device)
+
+    default_causal_lm = FlashCausalLM(
+        model_id=model_id,
+        model_class=FlashLlamaForCausalLM,
+        revision=revision,
+        quantize=quantize,
+        speculator=speculator,
+        dtype=dtype,
+        kv_cache_dtype=kv_cache_dtype,
+        trust_remote_code=trust_remote_code,
+        lora_adapter_ids=lora_adapter_ids,
+    )
+    model_tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    available_memory_after_model_and_tokenizer = torch.cuda.memory_allocated(device)
+    model_and_tokenizer_memory = (
+        available_memory_after_model_and_tokenizer - current_memory
+    )
+    model_and_tokenizer_memory_mb = model_and_tokenizer_memory / 1024 / 1024
+    print(f"Model and Tokenizer memory: {model_and_tokenizer_memory}")
+
+    default_pb_parameters = generate_pb2.NextTokenChooserParameters(
+        temperature=0.9,
+        top_k=10,
+        top_p=0.9,
+        typical_p=0.9,
+        repetition_penalty=1.2,
+        watermark=True,
+        frequency_penalty=0.1,
+    )
+
+    default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters(
+        stop_sequences=[], max_new_tokens=1024, ignore_eos_token=True
+    )
+
+    # define the batches to check and the expected number of blocks and output
+    batches_and_expected = [
+        Mock(
+            num_requests=8,
+            expected_num_blocks=16376,
+            expected_max_supported_total_tokens=22449,
+        ),
+        Mock(
+            num_requests=4,
+            expected_num_blocks=8188,
+            expected_max_supported_total_tokens=30768,
+        ),
+        Mock(
+            num_requests=2,
+            expected_num_blocks=4094,
+            expected_max_supported_total_tokens=40445,
+        ),
+    ]
+    for batch_of_size_n in batches_and_expected:
+        empty_cache()
+        synchronize()
+
+        # build the inputs (similar to prefill used in warmup)
+        inputs_text = "_test " * 1024
+        b64_encoded_image = "iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAIAAAAC64paAAABg2lDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw0AcxV/TSotUROxQxCFDdbKLijjWKhShQqgVWnUwufQLmrQkKS6OgmvBwY/FqoOLs64OroIg+AHi7OCk6CIl/i8ptIjx4Lgf7+497t4BQqvKNDOQADTdMjKppJjLr4rBVwQQwhAERGVm1uckKQ3P8XUPH1/v4jzL+9yfY0AtmAzwicQJVjcs4g3imU2rznmfOMLKskp8Tjxh0AWJH7muuPzGueSwwDMjRjYzTxwhFks9rPQwKxsa8TRxTNV0yhdyLquctzhr1Qbr3JO/MFzQV5a5TnMUKSxiCRJEKGiggiosxGnVSTGRof2kh3/E8UvkUshVASPHAmrQIDt+8D/43a1ZnJp0k8JJoO/Ftj/GgOAu0G7a9vexbbdPAP8zcKV3/bUWMPtJerOrxY6AwW3g4rqrKXvA5Q4QfarLhuxIfppCsQi8n9E35YHhW6B/ze2ts4/TByBLXaVvgINDYLxE2ese7w719vbvmU5/PycecohsjayNAAAACXBIWXMAAC4jAAAuIwF4pT92AAAAB3RJTUUH6AQIEQMnlTSSjwAAABl0RVh0Q29tbWVudABDcmVhdGVkIHdpdGggR0lNUFeBDhcAAAASSURBVDjLY2AYBaNgFIyCoQsABMQAAeRw1DoAAAAASUVORK5CYII="
+        inputs_image = f"![](data:image/jpeg;base64,{b64_encoded_image})"
+        inputs = inputs_text + inputs_image
+        # inputs are also added as chunks to define the type
+        input_chunks = [
+            generate_pb2.InputChunk(text=inputs_text),
+            generate_pb2.InputChunk(
+                image=generate_pb2.Image(
+                    # convert the base64 encoded image to bytes by decoding it
+                    data=base64.b64decode(b64_encoded_image),
+                    mimetype="image/jpeg;base64",
+                )
+            ),
+        ]
+
+        # build a batch of size n requests
+        default_pb_requests = []
+        for i in range(batch_of_size_n.num_requests):
+            req = generate_pb2.Request(
+                id=i,
+                inputs=inputs,
+                input_chunks=generate_pb2.Input(chunks=input_chunks),
+                prefill_logprobs=True,
+                truncate=1024,
+                parameters=default_pb_parameters,
+                stopping_parameters=default_pb_stop_parameters,
+            )
+            default_pb_requests.append(req)
+
+        # convert the list of requests to a FlashCausalLMBatch this will calculate the number of blocks
+        default_pb_batch = generate_pb2.Batch(
+            id=0, requests=default_pb_requests, size=batch_of_size_n.num_requests
+        )
+        default_flash_causal_lm_batch = FlashCausalLMBatch.from_pb(
+            default_pb_batch, model_tokenizer, torch.float16, torch.device("cuda:0")
+        )
+        print("number of blocks", default_flash_causal_lm_batch.num_blocks)
+        assert (
+            default_flash_causal_lm_batch.num_blocks
+            == batch_of_size_n.expected_num_blocks
+        )
+        max_supported_total_tokens = default_causal_lm.warmup(
+            default_flash_causal_lm_batch
+        )
+        print("output", max_supported_total_tokens)
+        assert (
+            max_supported_total_tokens
+            == batch_of_size_n.expected_max_supported_total_tokens
+        )
+        warmup_response = generate_pb2.WarmupResponse(
+            max_supported_total_tokens=max_supported_total_tokens
+        )
+
+
+if __name__ == "__main__":
+    test_flash_causal_lm_warmup()
--- a/server/tests/utils/test_kv_cache.py
+++ b/server/tests/utils/test_kv_cache.py
@ -0,0 +1,44 @@
+import torch
+import pytest
+from text_generation_server.models.globals import ATTENTION, BLOCK_SIZE
+from text_generation_server.layers.attention import KVCache
+from text_generation_server.utils.import_utils import SYSTEM
+
+def test_kvcache_memory():
+    if SYSTEM == "cuda":
+        kvcache_memory()
+    else:
+        pytest.skip("Test only runs on CUDA")
+
+def kvcache_memory():
+    num_blocks = 8188
+    num_kv_heads = 8
+    head_size = 128
+    kv_cache_dtype = torch.float16
+    device = torch.device("cuda:0")
+    num_layers = 32
+
+    current_memory = torch.cuda.memory_allocated(device)
+
+    kv_cache = [
+        KVCache(
+            num_blocks=num_blocks,
+            num_heads=num_kv_heads,
+            head_size=head_size,
+            dtype=kv_cache_dtype,
+            device=device,
+        )
+        for _ in range(num_layers)
+    ]
+
+    available_memory_after_kv_cache = torch.cuda.memory_allocated(device)
+    kv_cache_memory = available_memory_after_kv_cache - current_memory
+    kv_cache_memory_mb = kv_cache_memory / 1024 / 1024
+    
+    print(f"KV Cache memory: {kv_cache_memory}")
+    assert kv_cache_memory_mb > 1023
+    assert kv_cache_memory_mb < 1025
+
+
+if __name__ == "__main__":
+    test_kvcache_memory()