hf_text-generation-inference/load_tests/starcoder_load.js

63 lines
1.8 KiB
JavaScript

import {check} from 'k6';
import http from 'k6/http';
import {Trend} from 'k6/metrics';
const host = __ENV.HOST || '127.0.0.1:3000';
const totalTime = new Trend('total_time', true);
const validationTime = new Trend('validation_time', true);
const queueTime = new Trend('queue_time', true);
const inferenceTime = new Trend('inference_time', true);
const timePerToken = new Trend('time_per_token', true);
const example = {
payload: JSON.stringify({
inputs: '# This is a fibonacci function written in the Python programming language.' +
'def fibonacci',
parameters: {
details: true,
max_new_tokens: 60,
temperature: 0.2,
top_p: 0.95,
seed: 0,
},
}),
generated_tokens: 60
};
export const options = {
thresholds: {
http_req_failed: ['rate==0'],
time_per_token: ['p(95)<90'],
queue_time: ['p(95)<1500'],
},
scenarios: {
load_test: {
executor: 'constant-arrival-rate',
duration: '60s',
preAllocatedVUs: 100,
rate: 10,
timeUnit: '1s',
},
},
};
export default function () {
const headers = {'Content-Type': 'application/json'};
const res = http.post(`http://${host}/generate`, example.payload, {
headers,
});
check(res, {
'Post status is 200': (r) => res.status === 200,
'Post response generated tokens': (r) => res.status === 200 && res.json().details.generated_tokens === example.generated_tokens,
});
if (res.status === 200) {
totalTime.add(res.headers["X-Total-Time"]);
validationTime.add(res.headers["X-Validation-Time"]);
queueTime.add(res.headers["X-Queue-Time"]);
inferenceTime.add(res.headers["X-Inference-Time"]);
timePerToken.add(res.headers["X-Time-Per-Token"]);
}
}