diff --git a/load_tests/long.js b/load_tests/long.js
new file mode 100644
index 00000000..2aec977e
--- /dev/null
+++ b/load_tests/long.js
@@ -0,0 +1,94 @@
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
+import http from 'k6/http';
+import { Trend, Counter } from 'k6/metrics';
+
+const host = __ENV.HOST;
+const model_id = __ENV.MODEL_ID;
+const timePerToken = new Trend('time_per_token', true);
+const tokens = new Counter('tokens');
+const new_tokens = new Counter('new_tokens');
+const input_tokens = new Counter('input_tokens');
+const max_new_tokens = 50;
+
+// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
+const shareGPT = JSON.parse(open("long.json"))
+
+
+export function get_options() {
+    return {
+        thresholds: {
+            http_req_failed: ['rate==0'],
+            // time_per_token: [{
+            //     threshold: `p(50)<${5 * reference_latency_ms}`,
+            //     abortOnFail: true,
+            //     delayAbortEval: '10s'
+            // }],
+        },
+        scenarios: {
+            // single_user: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 1,
+            //     rate: 20,
+            //     timeUnit: '1s',
+            // },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 100,
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
+            // breakpoint: {
+            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
+            //     preAllocatedVUs: 300,
+            //     stages: [
+            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
+            //     ],
+            // },
+            throughput: {
+                executor: 'shared-iterations',
+                vus: 10,
+                iterations: 10,
+                maxDuration: '120s',
+            },
+        },
+    };
+}
+
+function generate_payload(gpt, max_new_tokens) {
+    const input = gpt["conversations"][0]["value"];
+    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+}
+
+export const options = get_options();
+
+export default function run() {
+    const headers = { 'Content-Type': 'application/json' };
+    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
+    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
+    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
+        headers,
+    });
+    if (res.status >= 400 && res.status < 500) {
+        return;
+    }
+
+
+    check(res, {
+        'Post status is 200': (res) => res.status === 200,
+    });
+    const duration = res.timings.duration;
+
+    if (res.status === 200) {
+        const body = res.json();
+        const completion_tokens = body.usage.completion_tokens;
+        const latency_ms_per_token = duration / completion_tokens;
+        timePerToken.add(latency_ms_per_token);
+        const prompt_tokens = body.usage.prompt_tokens;
+        input_tokens.add(prompt_tokens);
+        new_tokens.add(completion_tokens);
+        tokens.add(completion_tokens + prompt_tokens);
+    }
+}
diff --git a/load_tests/long.py b/load_tests/long.py
new file mode 100644
index 00000000..653064ef
--- /dev/null
+++ b/load_tests/long.py
@@ -0,0 +1,19 @@
+import datasets
+import json
+
+
+dataset = datasets.load_dataset("ccdv/govreport-summarization")
+max_new_tokens = 50
+
+
+conversations = []
+
+for i, item in enumerate(dataset["test"]):
+    report = item["report"]
+
+    messages = [{"from": "human", "value": f"Summarize this report: ```{report}```"}]
+
+    conversations.append({"conversations": messages})
+
+with open("long.json", "w") as f:
+    json.dump(conversations, f, indent=4)
diff --git a/load_tests/long_prompt2.py b/load_tests/long_prompt2.py
new file mode 100644
index 00000000..d65aa1cf
--- /dev/null
+++ b/load_tests/long_prompt2.py
@@ -0,0 +1,22 @@
+# https://www.gutenberg.org/cache/epub/103/pg103.txt
+from openai import OpenAI
+import os
+import requests
+
+if not os.path.exists("pg103.txt"):
+    response = requests.get("https://www.gutenberg.org/cache/epub/103/pg103.txt")
+    with open("pg103.txt", "w") as f:
+        f.write(response.text)
+
+
+length = 130000
+with open("pg103.txt", "r") as f:
+    data = f.read()
+
+messages = [{"role": "user", "content": data[: length * 4]}]
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="w")
+
+completion = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct", messages=messages, max_tokens=2
+)