feat: add simple ttft load_test

2024-07-02 15:57:01 +00:00 · 2024-07-02 15:57:01 +00:00 · fe3991e857
parent 0759ec495e
commit fe3991e857
1 changed files with 108 additions and 0 deletions
--- a/load_tests/ttft.py
+++ b/load_tests/ttft.py
@@ -0,0 +1,108 @@
+import asyncio
+import aiohttp
+import json
+import os
+from time import time
+
+# Target server as host:port; override with the HOST environment variable.
+HOST = os.environ.get("HOST", "localhost:3000")
+# Value sent in the "model" field of every request; override with MODEL_ID.
+MODEL_ID = os.environ.get("MODEL_ID", "default-model")
+# Number of concurrent requests issued per benchmark run.
+NUM_REQUESTS = 10
+# Value sent as "max_tokens" in every request payload.
+MAX_NEW_TOKENS = 100
+# Per-request timeout handed to the HTTP client, in seconds.
+TIMEOUT = 30
+
+
+def load_inputs(filename):
+    """Load prompt texts from a JSON conversation dump.
+
+    The file must contain a JSON array. For every object in it that has a
+    non-empty "conversations" list, the "value" of the first entry is used
+    as a prompt; every other element is skipped.
+
+    Args:
+        filename: Path to the JSON file.
+
+    Returns:
+        A list with one prompt per usable item, in file order.
+    """
+    # Decode explicitly as UTF-8 so non-ASCII prompts do not depend on the
+    # platform's locale encoding.
+    with open(filename, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    return [
+        item["conversations"][0]["value"]
+        for item in data
+        if isinstance(item, dict) and item.get("conversations")
+    ]
+
+
+def generate_payload(input_text):
+    """Build the streaming chat-completion request body for one prompt.
+
+    The prompt is sent as a single user message with temperature 0; the
+    model name and token limit come from MODEL_ID and MAX_NEW_TOKENS.
+    """
+    user_message = {"role": "user", "content": input_text}
+    body = {"messages": [user_message]}
+    body["temperature"] = 0
+    body["model"] = MODEL_ID
+    body["max_tokens"] = MAX_NEW_TOKENS
+    # Streaming is required: the benchmark times the first SSE line.
+    body["stream"] = True
+    return body
+
+
+async def benchmark_sse(session, input_text):
+    """Measure the time to the first SSE event of a streamed chat completion.
+
+    Args:
+        session: Open aiohttp.ClientSession used to send the request.
+        input_text: Prompt sent as the single user message.
+
+    Returns:
+        Milliseconds elapsed between sending the request and receiving the
+        first response line that starts with "data:".
+
+    Raises:
+        aiohttp.ClientResponseError: If the server answers with status >= 400.
+        Exception: If the stream ends without any "data:" line, or if the
+            request exceeds TIMEOUT seconds.
+    """
+    # Local import: the module only imports time.time, and a monotonic clock
+    # is needed so wall-clock adjustments cannot skew the measurement.
+    from time import perf_counter
+
+    payload = generate_payload(input_text)
+    start_time = perf_counter()
+
+    try:
+        async with session.post(
+            f"http://{HOST}/v1/chat/completions",
+            json=payload,
+            # Passing a bare number as timeout is deprecated in aiohttp.
+            timeout=aiohttp.ClientTimeout(total=TIMEOUT),
+        ) as response:
+            # Report HTTP errors as such instead of as a missing SSE stream.
+            response.raise_for_status()
+            async for line in response.content:
+                if line.startswith(b"data:"):
+                    # Stop at the first event; the rest of the stream is
+                    # not read.
+                    return (perf_counter() - start_time) * 1000
+
+        # The body ended without a single "data:" line.
+        raise Exception("No SSE data received within the timeout period")
+
+    except asyncio.TimeoutError as err:
+        raise Exception(f"Request timed out after {TIMEOUT} seconds") from err
+
+
+async def run_benchmark(inputs, same_input=False):
+    """Fire NUM_REQUESTS concurrent requests and report time to first event.
+
+    Args:
+        inputs: Non-empty sequence of prompt strings.
+        same_input: When true, every request sends inputs[0]; otherwise the
+            requests cycle through inputs in order.
+
+    Returns:
+        The mean time to first event in milliseconds over the successful
+        requests, or None if every request failed.
+
+    Raises:
+        ValueError: If inputs is empty.
+    """
+    # Fail early with a clear message rather than an IndexError or
+    # ZeroDivisionError when picking prompts below.
+    if not inputs:
+        raise ValueError("inputs must contain at least one prompt")
+
+    prompts = [
+        inputs[0] if same_input else inputs[i % len(inputs)]
+        for i in range(NUM_REQUESTS)
+    ]
+    longest_input = max((len(prompt) for prompt in prompts), default=0)
+
+    results = []
+    async with aiohttp.ClientSession() as session:
+        tasks = [
+            asyncio.create_task(benchmark_sse(session, prompt))
+            for prompt in prompts
+        ]
+
+        # as_completed yields in completion order, so the printed number is
+        # the finishing rank, not the submission index.
+        for i, task in enumerate(asyncio.as_completed(tasks), 1):
+            try:
+                time_to_first_event = await task
+                results.append(time_to_first_event)
+                print(
+                    f"Request {i}: Time to first event - {time_to_first_event:.2f}ms longest input: {longest_input}"
+                )
+            except Exception as e:
+                # One failed request must not abort the whole run.
+                print(f"Request {i} failed: {str(e)}")
+
+    if not results:
+        print("\nNo successful requests")
+        return None
+
+    avg_time = sum(results) / len(results)
+    print(f"\nAverage time to first event: {avg_time:.2f}ms")
+    return avg_time
+
+
+async def main():
+    """Run the benchmark with identical and with varied prompts and compare.
+
+    Prompts are read from "small.json" in the current working directory.
+    The two averages are printed side by side only when both runs produced
+    at least one successful request.
+    """
+    inputs = load_inputs("small.json")
+
+    print("Running benchmark with same input:")
+    same_input_avg = await run_benchmark(inputs, same_input=True)
+
+    # Pause for a second so the next run's requests do not land in the same
+    # batch as the previous run's.
+    await asyncio.sleep(1)
+
+    print("\nRunning benchmark with different inputs:")
+    different_inputs_avg = await run_benchmark(inputs, same_input=False)
+
+    # Compare against None explicitly: an average of 0.0 is a valid result
+    # and must not be treated as "no data".
+    if same_input_avg is not None and different_inputs_avg is not None:
+        print(f"\nSame input average: {same_input_avg:.2f}ms")
+        print(f"Different inputs average: {different_inputs_avg:.2f}ms")
+
+
+# Run the benchmark only when executed as a script, not when imported.
+if __name__ == "__main__":
+    asyncio.run(main())