Adding small benchmark script. (#881)

# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
2023-08-18 19:28:56 +02:00 · 2023-08-18 19:28:56 +02:00 · c4422e5678
parent bfa070611d
commit c4422e5678
3 changed files with 98 additions and 0 deletions
--- a/load_tests/common.js
+++ b/load_tests/common.js
@ -0,0 +1,64 @@
+import { check, randomSeed } from 'k6';
+import http from 'k6/http';
+import { Trend, Counter } from 'k6/metrics';
+import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
+
+// Seed handed to k6's randomSeed() below.
+const seed = 0;
+
+// Default target address; can be overridden through the HOST environment variable.
+// NOTE(review): `run` receives its own `host` parameter, so this constant does not
+// appear to be read anywhere in this file — confirm before relying on it.
+const host = __ENV.HOST || '127.0.0.1:8000';
+// Custom k6 metrics that `run` feeds after each successful request.
+// NOTE(review): the `true` argument is presumably k6's `isTime` flag — confirm in the k6 Trend docs.
+const timePerToken = new Trend('time_per_token', true);
+const throughput = new Counter('tokens_per_s');
+
+randomSeed(seed);
+// Alternative dataset file, left disabled; only "small.json" below is actually loaded.
+// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
+const shareGPT = JSON.parse(open("small.json"))
+
+
+export function get_options(reference_latency_ms){
+    return {
+        thresholds: {
+            http_req_failed: ['rate==0'],
+            time_per_token: [{
+                threshold: `p(50)<${3 * reference_latency_ms}`,
+                abortOnFail: true,
+                delayAbortEval: '10s'
+            }],
+        },
+        scenarios: {
+            load_test: {
+                executor: 'constant-arrival-rate',
+                duration: '60s',
+                preAllocatedVUs: 100,
+                rate: 10,
+                timeUnit: '1s',
+            },
+        },
+    };
+}
+
+
+export function run(host, generate_payload, max_new_tokens) {
+    const headers = {'Content-Type': 'application/json'};
+    const query = randomItem(shareGPT);
+    const payload = JSON.stringify(generate_payload(query));
+    const res = http.post(`http://${host}/generate`, payload, {
+        headers,
+    });
+    if(res.status >= 400 && res.status < 500){
+        return;
+    }
+
+    check(res, {
+        'Post status is 200': (r) => res.status === 200,
+    });
+    const n_tokens = max_new_tokens;
+    const timings = res.timings.duration;
+
+    if (res.status === 200) {
+        const latency_ms_per_token = timings / n_tokens;
+        timePerToken.add(latency_ms_per_token);
+        const latency_in_s = latency_ms_per_token / 1000;
+        const individual_throughput = 1 / latency_in_s;
+        throughput.add(individual_throughput);
+    }
+}
--- a/load_tests/tgi.js
+++ b/load_tests/tgi.js
@ -0,0 +1,17 @@
+import { get_options, run } from "./common.js";
+ 
+// Reference per-token latency, passed to get_options() to derive the latency threshold.
+const reference_latency_ms = 30;
+// Address of the server under test; can be overridden through the HOST environment variable.
+const host = __ENV.HOST || '127.0.0.1:8000';
+// Generation length requested in the payload; also passed to run() as the token count.
+const max_new_tokens = 50;
+
+
+function generate_payload(gpt){
+    const input = gpt["conversations"][0]["value"];
+    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature" : 0.5}}
+}
+
+// Exported test options, built from this script's reference latency.
+export const options = get_options(reference_latency_ms);
+
+// Default export: delegates each invocation to the shared run() helper with
+// this script's host, payload builder and token count.
+export default function(){
+    run(host, generate_payload, max_new_tokens);
+}
--- a/load_tests/vllm.js
+++ b/load_tests/vllm.js
@ -0,0 +1,17 @@
+import { get_options, run } from "./common.js";
+ 
+// Reference per-token latency, passed to get_options() to derive the latency threshold.
+const reference_latency_ms = 22;
+// Address of the server under test; can be overridden through the HOST environment variable.
+const host = __ENV.HOST || '127.0.0.1:8000';
+// Token count passed to run(), which divides the request duration by it.
+const max_new_tokens = 50;
+
+
+function generate_payload(gpt){
+    const input = gpt["conversations"][0]["value"];
+    return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
+}
+
+// Exported test options, built from this script's reference latency.
+export const options = get_options(reference_latency_ms);
+
+// Default export: delegates each invocation to the shared run() helper with
+// this script's host, payload builder and token count.
+export default function(){
+    run(host, generate_payload, max_new_tokens);
+}