diff --git a/load_tests/common.js b/load_tests/common.js
new file mode 100644
index 00000000..be812e9b
--- /dev/null
+++ b/load_tests/common.js
@@ -0,0 +1,64 @@
+import { check, randomSeed } from 'k6';
+import http from 'k6/http';
+import { Trend, Counter } from 'k6/metrics';
+import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
+
+const seed = 0;
+
+const host = __ENV.HOST || '127.0.0.1:8000';
+const timePerToken = new Trend('time_per_token', true);
+const throughput = new Counter('tokens_per_s');
+
+randomSeed(seed);
+// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
+const shareGPT = JSON.parse(open("small.json"))
+
+
+export function get_options(reference_latency_ms){
+    return {
+        thresholds: {
+            http_req_failed: ['rate==0'],
+            time_per_token: [{
+                threshold: `p(50)<${3 * reference_latency_ms}`,
+                abortOnFail: true,
+                delayAbortEval: '10s'
+            }],
+        },
+        scenarios: {
+            load_test: {
+                executor: 'constant-arrival-rate',
+                duration: '60s',
+                preAllocatedVUs: 100,
+                rate: 10,
+                timeUnit: '1s',
+            },
+        },
+    };
+}
+
+
+export function run(host, generate_payload, max_new_tokens) {
+    const headers = {'Content-Type': 'application/json'};
+    const query = randomItem(shareGPT);
+    const payload = JSON.stringify(generate_payload(query));
+    const res = http.post(`http://${host}/generate`, payload, {
+        headers,
+    });
+    if(res.status >= 400 && res.status < 500){
+        return;
+    }
+
+    check(res, {
+        'Post status is 200': (r) => res.status === 200,
+    });
+    const n_tokens = max_new_tokens;
+    const timings = res.timings.duration;
+
+    if (res.status === 200) {
+        const latency_ms_per_token = timings / n_tokens;
+        timePerToken.add(latency_ms_per_token);
+        const latency_in_s = latency_ms_per_token / 1000;
+        const individual_throughput = 1 / latency_in_s;
+        throughput.add(individual_throughput);
+    }
+}
diff --git a/load_tests/tgi.js b/load_tests/tgi.js
new file mode 100644
index 00000000..93a0e278
--- /dev/null
+++ b/load_tests/tgi.js
@@ -0,0 +1,17 @@
+import { get_options, run } from "./common.js";
+
+const reference_latency_ms = 30;
+const host = __ENV.HOST || '127.0.0.1:8000';
+const max_new_tokens = 50;
+
+
+function generate_payload(gpt){
+    const input = gpt["conversations"][0]["value"];
+    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature" : 0.5}}
+}
+
+export const options = get_options(reference_latency_ms);
+
+export default function(){
+    run(host, generate_payload, max_new_tokens);
+}
diff --git a/load_tests/vllm.js b/load_tests/vllm.js
new file mode 100644
index 00000000..fcb38262
--- /dev/null
+++ b/load_tests/vllm.js
@@ -0,0 +1,17 @@
+import { get_options, run } from "./common.js";
+
+const reference_latency_ms = 22;
+const host = __ENV.HOST || '127.0.0.1:8000';
+const max_new_tokens = 50;
+
+
+function generate_payload(gpt){
+    const input = gpt["conversations"][0]["value"];
+    return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
+}
+
+export const options = get_options(reference_latency_ms);
+
+export default function(){
+    run(host, generate_payload, max_new_tokens);
+}
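
A minimal invocation sketch for these scripts (assumptions: k6 is installed, a small.json dataset file sits next to the scripts, and an inference server is already listening on the target host; the k6 `-e` flag sets the value read via `__ENV.HOST`, which otherwise defaults to 127.0.0.1:8000):

    cd load_tests
    k6 run tgi.js
    k6 run -e HOST=127.0.0.1:8000 vllm.js

Each run drives a constant arrival rate of 10 requests/s for 60s and aborts early if the p50 of time_per_token exceeds 3x the backend's reference latency.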