More tests.
This commit is contained in:
parent
1352f70847
commit
13e6d522b7
@@ -0,0 +1,94 @@
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;

// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("long.json"))


export function get_options() {
    return {
        thresholds: {
            http_req_failed: ['rate==0'],
            // time_per_token: [{
            //     threshold: `p(50)<${5 * reference_latency_ms}`,
            //     abortOnFail: true,
            //     delayAbortEval: '10s'
            // }],
        },
        scenarios: {
            // single_user: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 1,
            //     rate: 20,
            //     timeUnit: '1s',
            // },
            // load_test: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 100,
            //     rate: 1,
            //     timeUnit: '1s',
            // },
            // breakpoint: {
            //     executor: 'ramping-arrival-rate', // Assure load increase if the system slows
            //     preAllocatedVUs: 300,
            //     stages: [
            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
            //     ],
            // },
            throughput: {
                executor: 'shared-iterations',
                vus: 10,
                iterations: 10,
                maxDuration: '120s',
            },
        },
    };
}

function generate_payload(gpt, max_new_tokens) {
    const input = gpt["conversations"][0]["value"];
    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}

export const options = get_options();

export default function run() {
    const headers = { 'Content-Type': 'application/json' };
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
    if (res.status >= 400 && res.status < 500) {
        return;
    }

    check(res, {
        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
        const completion_tokens = body.usage.completion_tokens;
        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
        const prompt_tokens = body.usage.prompt_tokens;
        input_tokens.add(prompt_tokens);
        new_tokens.add(completion_tokens);
        tokens.add(completion_tokens + prompt_tokens);
    }
}
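The script reads HOST and MODEL_ID from the environment, so a plausible invocation (the filename load_test.js and the localhost address are assumptions, not shown in the diff; k6's -e flag populates __ENV) would be:

k6 run -e HOST=localhost:8000 -e MODEL_ID=meta-llama/Llama-3.1-8B-Instruct load_test.js

With only the throughput scenario enabled, 10 VUs share 10 iterations, so each VU issues roughly one chat-completion request against /v1/chat/completions and the time_per_token, new_tokens, input_tokens, and tokens metrics are derived from the usage block of each 200 response.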
@@ -0,0 +1,19 @@
import datasets
import json


dataset = datasets.load_dataset("ccdv/govreport-summarization")
max_new_tokens = 50


conversations = []

for i, item in enumerate(dataset["test"]):
    report = item["report"]

    messages = [{"from": "human", "value": f"Summarize this report: ```{report}```"}]

    conversations.append({"conversations": messages})

with open("long.json", "w") as f:
    json.dump(conversations, f, indent=4)
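A minimal sanity check on the generated file (a sketch assuming long.json sits in the working directory; this is not part of the commit) could confirm the prompts are long enough to exercise long contexts:

import json

# Load the file produced above and report prompt lengths in characters.
with open("long.json") as f:
    conversations = json.load(f)

lengths = sorted(len(c["conversations"][0]["value"]) for c in conversations)
print(f"{len(lengths)} prompts, median ~{lengths[len(lengths) // 2]} characters")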
@ -0,0 +1,22 @@
|
||||||
|
# https://www.gutenberg.org/cache/epub/103/pg103.txt
|
||||||
|
from openai import OpenAI
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
|
||||||
|
if not os.path.exists("pg103.txt"):
|
||||||
|
response = requests.get("https://www.gutenberg.org/cache/epub/103/pg103.txt")
|
||||||
|
with open("pg103.txt", "w") as f:
|
||||||
|
f.write(response.text)
|
||||||
|
|
||||||
|
|
||||||
|
length = 130000
|
||||||
|
with open("pg103.txt", "r") as f:
|
||||||
|
data = f.read()
|
||||||
|
|
||||||
|
messages = [{"role": "user", "content": data[: length * 4]}]
|
||||||
|
|
||||||
|
client = OpenAI(base_url="http://localhost:8000/v1", api_key="w")
|
||||||
|
|
||||||
|
completion = client.chat.completions.create(
|
||||||
|
model="meta-llama/Llama-3.1-8B-Instruct", messages=messages, max_tokens=2
|
||||||
|
)
|
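To see whether the long prompt (roughly length * 4 = 520k characters, i.e. on the order of 130k tokens at ~4 characters per token) was accepted and how many tokens it actually consumed, a hypothetical follow-up to the script above (relying on the usage field of the response, which not every backend populates) might print:

# Hypothetical follow-up, not in the commit.
print(completion.usage)                       # prompt_tokens should be near the 130k target
print(completion.choices[0].message.content)  # the tiny (max_tokens=2) reply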