72 lines
2.2 KiB
JavaScript
72 lines
2.2 KiB
JavaScript
import { check, randomSeed } from 'k6';
|
|
import http from 'k6/http';
|
|
import { Trend, Counter } from 'k6/metrics';
|
|
import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
|
|
|
|
// Seed handed to k6's randomSeed() below; a fixed value keeps the random
// prompt selection reproducible between runs.
const seed = 0;

// Target server as `host:port`; override with the HOST environment variable
// (e.g. `k6 run -e HOST=10.0.0.5:8080 ...`).
// NOTE(review): run() takes its own `host` argument, so this constant is not
// read by the functions in this file — confirm whether it is still needed.
const host = __ENV.HOST || '127.0.0.1:8000';

// Custom metrics fed by run().
// Request duration divided by the number of generated tokens (time-valued trend).
const timePerToken = new Trend('time_per_token', true);
// Prompt tokens + generated tokens.
const tokens = new Counter('tokens');
// Generated tokens only.
const new_tokens = new Counter('new_tokens');
// Prompt (prefill) tokens only.
const input_tokens = new Counter('input_tokens');

randomSeed(seed);

// Pool of queries that run() samples from. "small.json" is the dataset loaded
// here; the full "ShareGPT_V3_unfiltered_cleaned_split.json" file can be
// loaded instead by changing the path below.
const shareGPT = JSON.parse(open("small.json"));
|
|
|
|
|
|
/**
 * Build the k6 `options` object for the load test.
 *
 * Thresholds:
 *  - `http_req_failed`: the rate of failed requests must be exactly 0.
 *  - `time_per_token`: the median (p50) must stay below five times the
 *    reference latency; crossing it aborts the run (`abortOnFail`), with the
 *    abort evaluation delayed by 10s (`delayAbortEval`).
 *
 * Scenario `load_test`: `constant-arrival-rate` executor starting 10
 * iterations per 1s for 60s, with 10 pre-allocated VUs.
 *
 * @param {number} reference_latency_ms - Reference per-token latency in
 *   milliseconds; the median threshold is set to 5x this value.
 * @returns {object} Options object with `thresholds` and `scenarios`.
 * @throws {TypeError} If `5 * reference_latency_ms` is not a finite number
 *   (e.g. the argument is missing), which would otherwise yield the
 *   meaningless threshold expression `p(50)<NaN`.
 */
export function get_options(reference_latency_ms) {
  const maxMedianLatencyMs = 5 * reference_latency_ms;
  if (!Number.isFinite(maxMedianLatencyMs)) {
    throw new TypeError(
      `reference_latency_ms must be a finite number, got ${String(reference_latency_ms)}`,
    );
  }

  return {
    thresholds: {
      http_req_failed: ['rate==0'],
      time_per_token: [
        {
          threshold: `p(50)<${maxMedianLatencyMs}`,
          abortOnFail: true,
          delayAbortEval: '10s',
        },
      ],
    },
    scenarios: {
      load_test: {
        executor: 'constant-arrival-rate',
        duration: '60s',
        preAllocatedVUs: 10,
        rate: 10,
        timeUnit: '1s',
      },
    },
  };
}
|
|
|
|
|
|
/**
 * Execute one load-test iteration: pick a random query from `shareGPT`, POST
 * it to `http://<host>/generate`, and record token metrics from the response.
 *
 * Behavior by response status:
 *  - 4xx: the iteration returns immediately; no check is recorded and no
 *    custom metric is updated.
 *  - anything else: the 'Post status is 200' check is recorded.
 *  - 200: the JSON body is read and `time_per_token`, `tokens`,
 *    `input_tokens` and `new_tokens` are updated from `body.details.tokens`
 *    and `body.details.prefill`.
 *
 * @param {string} host - Target server as `host[:port]` (no scheme).
 * @param {function(object): object} generate_payload - Maps the sampled query
 *   to the request body; its result is JSON-serialized.
 * @param {number} max_new_tokens - Not read by this function; kept so the
 *   existing call signature stays unchanged.
 * @returns {void}
 */
export function run(host, generate_payload, max_new_tokens) {
  const headers = { 'Content-Type': 'application/json' };
  const query = randomItem(shareGPT);
  const payload = JSON.stringify(generate_payload(query));
  const res = http.post(`http://${host}/generate`, payload, { headers });

  // Client errors are skipped entirely: no check, no token metrics.
  if (res.status >= 400 && res.status < 500) {
    return;
  }

  check(res, {
    'Post status is 200': (r) => r.status === 200,
  });

  if (res.status !== 200) {
    return;
  }

  const body = res.json();
  const generatedTokenCount = body.details.tokens.length;

  // A response with zero generated tokens would divide by zero and push
  // Infinity/NaN into the trend that drives the abort threshold, so only
  // record a per-token latency when at least one token was produced.
  if (generatedTokenCount > 0) {
    timePerToken.add(res.timings.duration / generatedTokenCount);
  }

  const promptTokenCount = body.details.prefill.length;
  tokens.add(generatedTokenCount + promptTokenCount);
  input_tokens.add(promptTokenCount);
  new_tokens.add(generatedTokenCount);
}
|