feat: add simple ttft load_test

2024-07-02 15:57:01 +00:00 · 2024-07-02 15:57:01 +00:00 · fe3991e857
parent 0759ec495e
commit fe3991e857
1 changed files with 108 additions and 0 deletions
--- a/load_tests/ttft.py
+++ b/load_tests/ttft.py
@ -0,0 +1,108 @@
 import asyncio
 import json
 import os
 from time import perf_counter
 from time import time

 import aiohttp
 # Target server as "host:port"; requests go to http://{HOST}/v1/chat/completions.
 HOST = os.environ.get("HOST", "localhost:3000")
 # Value sent in the "model" field of every request payload.
 MODEL_ID = os.environ.get("MODEL_ID", "default-model")
 # Number of concurrent requests issued per benchmark run.
 NUM_REQUESTS = 10
 # Value sent in the "max_tokens" field of every request payload.
 MAX_NEW_TOKENS = 100
 # Per-request timeout, in seconds.
 TIMEOUT = 30
 def load_inputs(filename):
    """Load benchmark prompts from a JSON conversation file.

    The file is expected to hold a JSON array of objects. For every element
    that has a non-empty ``"conversations"`` list, the ``"value"`` of the
    first conversation entry becomes a prompt; all other elements are skipped.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list with one prompt per usable element, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a first conversation entry has no ``"value"`` key.
    """
    # JSON is UTF-8 by specification; do not rely on the locale's default
    # encoding, which can garble or reject non-ASCII prompts on some hosts.
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [
        item["conversations"][0]["value"]
        for item in data
        if "conversations" in item and len(item["conversations"]) > 0
    ]
 def generate_payload(input_text):
    """Build the JSON body of a streaming chat-completion request.

    Args:
        input_text: Prompt sent as the content of the single user message.

    Returns:
        A dict with the keys ``messages``, ``temperature``, ``model``,
        ``max_tokens`` and ``stream`` (in that order), using the module-level
        ``MODEL_ID`` and ``MAX_NEW_TOKENS`` settings.
    """
    user_message = dict(role="user", content=input_text)
    request_body = dict(
        messages=[user_message],
        temperature=0,
        model=MODEL_ID,
        max_tokens=MAX_NEW_TOKENS,
        stream=True,
    )
    return request_body
 async def benchmark_sse(session, input_text):
    """Measure the time to the first SSE event of one streamed chat completion.

    POSTs a streaming request for ``input_text`` to
    ``http://{HOST}/v1/chat/completions`` and stops reading as soon as the
    first response line starting with ``data:`` arrives.

    Args:
        session: Client session used to send the request (an
            ``aiohttp.ClientSession`` when called from ``run_benchmark``).
        input_text: Prompt sent as the single user message.

    Returns:
        Elapsed time in milliseconds between issuing the request and
        receiving the first ``data:`` line.

    Raises:
        Exception: If the request exceeds ``TIMEOUT`` seconds, or the
            response ends without any ``data:`` line.
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (4xx/5xx).
    """
    payload = generate_payload(input_text)
    # Pass an explicit ClientTimeout; a bare number is only accepted by
    # aiohttp for backward compatibility.
    request_timeout = aiohttp.ClientTimeout(total=TIMEOUT)
    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments that happen while the request is in flight.
    start_time = perf_counter()
    try:
        async with session.post(
            f"http://{HOST}/v1/chat/completions",
            json=payload,
            timeout=request_timeout,
        ) as response:
            # Surface HTTP errors as such instead of misreporting an error
            # body as "no SSE data".
            response.raise_for_status()
            async for line in response.content:
                if line.startswith(b"data:"):
                    # Only the first event matters; returning here leaves the
                    # context manager and releases the response.
                    return (perf_counter() - start_time) * 1000
    except asyncio.TimeoutError as err:
        raise Exception(f"Request timed out after {TIMEOUT} seconds") from err
    # The stream ended without a single "data:" line.
    raise Exception("No SSE data received within the timeout period")
 async def run_benchmark(inputs, same_input=False):
    """Send ``NUM_REQUESTS`` concurrent requests and report time to first event.

    Args:
        inputs: Non-empty sequence of prompt strings.
        same_input: When true, every request sends ``inputs[0]``; otherwise
            request ``i`` sends ``inputs[i % len(inputs)]``.

    Returns:
        The mean time to first event in milliseconds over the successful
        requests, or ``None`` when every request failed.

    Raises:
        ValueError: If ``inputs`` is empty.
    """
    # Fail early with a clear message instead of an IndexError or
    # ZeroDivisionError from the prompt selection below.
    if not inputs:
        raise ValueError("inputs must contain at least one prompt")

    prompts = [
        inputs[0] if same_input else inputs[i % len(inputs)]
        for i in range(NUM_REQUESTS)
    ]
    longest_input = max((len(prompt) for prompt in prompts), default=0)

    results = []
    async with aiohttp.ClientSession() as session:
        # Create every task up front so all requests are in flight together.
        tasks = [
            asyncio.create_task(benchmark_sse(session, prompt))
            for prompt in prompts
        ]
        # as_completed yields in completion order, so ``i`` numbers the
        # requests by when they finished, not by when they were submitted.
        for i, task in enumerate(asyncio.as_completed(tasks), 1):
            try:
                time_to_first_event = await task
                results.append(time_to_first_event)
                print(
                    f"Request {i}: Time to first event - {time_to_first_event:.2f}ms longest input: {longest_input}"
                )
            except Exception as e:
                # Best effort: one failed request must not abort the run.
                print(f"Request {i} failed: {str(e)}")

    if not results:
        print("\nNo successful requests")
        return None
    avg_time = sum(results) / len(results)
    print(f"\nAverage time to first event: {avg_time:.2f}ms")
    return avg_time
 async def main():
    """Run the benchmark with identical prompts, then with varied prompts.

    Prompts are loaded from ``small.json`` in the current working directory.
    Both averages are printed side by side when each run had at least one
    successful request.
    """
    inputs = load_inputs("small.json")
    print("Running benchmark with same input:")
    same_input_avg = await run_benchmark(inputs, same_input=True)
    # Sleep for a second so the next run's requests do not land in the same
    # batch as the previous run's.
    await asyncio.sleep(1)
    print("\nRunning benchmark with different inputs:")
    different_inputs_avg = await run_benchmark(inputs, same_input=False)
    # run_benchmark returns None when a run had no successful request; test
    # for that explicitly so a legitimate 0.0 average is not treated as missing.
    if same_input_avg is not None and different_inputs_avg is not None:
        print(f"\nSame input average: {same_input_avg:.2f}ms")
        print(f"Different inputs average: {different_inputs_avg:.2f}ms")
 # Script entry point: run the benchmark only when executed directly, not on import.
 if __name__ == "__main__":
    asyncio.run(main())