From 13e6d522b7d72aa08daf893a4fb709627ac90542 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 3 Dec 2024 19:05:36 +0100
Subject: [PATCH] More tests.

---
 load_tests/long.js         | 94 ++++++++++++++++++++++++++++++++++++++
 load_tests/long.py         | 19 ++++++++
 load_tests/long_prompt2.py | 22 +++++++++
 3 files changed, 135 insertions(+)
 create mode 100644 load_tests/long.js
 create mode 100644 load_tests/long.py
 create mode 100644 load_tests/long_prompt2.py

diff --git a/load_tests/long.js b/load_tests/long.js
new file mode 100644
index 00000000..2aec977e
--- /dev/null
+++ b/load_tests/long.js
@@ -0,0 +1,94 @@
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
+import http from 'k6/http';
+import { Trend, Counter } from 'k6/metrics';
+
+const host = __ENV.HOST;
+const model_id = __ENV.MODEL_ID;
+const timePerToken = new Trend('time_per_token', true);
+const tokens = new Counter('tokens');
+const new_tokens = new Counter('new_tokens');
+const input_tokens = new Counter('input_tokens');
+const max_new_tokens = 50;
+
+// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
+const shareGPT = JSON.parse(open("long.json"))
+
+
+export function get_options() {
+    return {
+        thresholds: {
+            http_req_failed: ['rate==0'],
+            // time_per_token: [{
+            //     threshold: `p(50)<${5 * reference_latency_ms}`,
+            //     abortOnFail: true,
+            //     delayAbortEval: '10s'
+            // }],
+        },
+        scenarios: {
+            // single_user: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 1,
+            //     rate: 20,
+            //     timeUnit: '1s',
+            // },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 100,
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
+            // breakpoint: {
+            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
+            //     preAllocatedVUs: 300,
+            //     stages: [
+            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
+            //     ],
+            // },
+            throughput: {
+                executor: 'shared-iterations',
+                vus: 10,
+                iterations: 10,
+                maxDuration: '120s',
+            },
+        },
+    };
+}
+
+function generate_payload(gpt, max_new_tokens) {
+    const input = gpt["conversations"][0]["value"];
+    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+}
+
+export const options = get_options();
+
+export default function run() {
+    const headers = { 'Content-Type': 'application/json' };
+    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
+    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
+    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
+        headers,
+    });
+    if (res.status >= 400 && res.status < 500) {
+        return;
+    }
+
+
+    check(res, {
+        'Post status is 200': (res) => res.status === 200,
+    });
+    const duration = res.timings.duration;
+
+    if (res.status === 200) {
+        const body = res.json();
+        const completion_tokens = body.usage.completion_tokens;
+        const latency_ms_per_token = duration / completion_tokens;
+        timePerToken.add(latency_ms_per_token);
+        const prompt_tokens = body.usage.prompt_tokens;
+        input_tokens.add(prompt_tokens);
+        new_tokens.add(completion_tokens);
+        tokens.add(completion_tokens + prompt_tokens);
+    }
+}
diff --git a/load_tests/long.py b/load_tests/long.py
new file mode 100644
index 00000000..653064ef
--- /dev/null
+++ b/load_tests/long.py
@@ -0,0 +1,19 @@
+import datasets
+import json
+
+
+dataset = datasets.load_dataset("ccdv/govreport-summarization")
+max_new_tokens = 50
+
+
+conversations = []
+
+for i, item in enumerate(dataset["test"]):
+    report = item["report"]
+
+    messages = [{"from": "human", "value": f"Summarize this report: ```{report}```"}]
+
+    conversations.append({"conversations": messages})
+
+with open("long.json", "w") as f:
+    json.dump(conversations, f, indent=4)
diff --git a/load_tests/long_prompt2.py b/load_tests/long_prompt2.py
new file mode 100644
index 00000000..d65aa1cf
--- /dev/null
+++ b/load_tests/long_prompt2.py
@@ -0,0 +1,22 @@
+# https://www.gutenberg.org/cache/epub/103/pg103.txt
+from openai import OpenAI
+import os
+import requests
+
+if not os.path.exists("pg103.txt"):
+    response = requests.get("https://www.gutenberg.org/cache/epub/103/pg103.txt")
+    with open("pg103.txt", "w") as f:
+        f.write(response.text)
+
+
+length = 130000
+with open("pg103.txt", "r") as f:
+    data = f.read()
+
+messages = [{"role": "user", "content": data[: length * 4]}]
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="w")
+
+completion = client.chat.completions.create(
+    model="meta-llama/Llama-3.1-8B-Instruct", messages=messages, max_tokens=2
+)