Updating the benchmarks so everyone uses the OpenAI compatibility layer. (#1800)

# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
2024-04-25 15:42:17 +02:00 · 2024-04-25 15:42:17 +02:00 · fccf5edf45
parent 0acac5cb7a
commit fccf5edf45
3 changed files with 53 additions and 64 deletions
--- a/load_tests/common.js
+++ b/load_tests/common.js
@ -1,71 +1,94 @@
-import { check, randomSeed } from 'k6';
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
 import http from 'k6/http';
 import { Trend, Counter } from 'k6/metrics';
-import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';

-const seed = 0;
-
-const host = __ENV.HOST || '127.0.0.1:8000';
+const host = __ENV.HOST;
+const model_id = __ENV.MODEL_ID;
 const timePerToken = new Trend('time_per_token', true);
 const tokens = new Counter('tokens');
 const new_tokens = new Counter('new_tokens');
 const input_tokens = new Counter('input_tokens');
+const max_new_tokens = 50;

-randomSeed(seed);
 // const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
 const shareGPT = JSON.parse(open("small.json"))


-export function get_options(reference_latency_ms){
+export function get_options() {
    return {
        thresholds: {
            http_req_failed: ['rate==0'],
-            time_per_token: [{
-                threshold: `p(50)<${5 * reference_latency_ms}`,
-                abortOnFail: true,
-                delayAbortEval: '10s'
-            }],
+            // time_per_token: [{
+            //     threshold: `p(50)<${5 * reference_latency_ms}`,
+            //     abortOnFail: true,
+            //     delayAbortEval: '10s'
+            // }],
        },
        scenarios: {
-            load_test: {
+            single_user: {
                executor: 'constant-arrival-rate',
                duration: '60s',
-                preAllocatedVUs: 10,
-                rate: 10,
+                preAllocatedVUs: 1,
+                rate: 1,
                timeUnit: '1s',
            },
+            // load_test: {
+            //     executor: 'constant-arrival-rate',
+            //     duration: '60s',
+            //     preAllocatedVUs: 100,
+            //     rate: 1,
+            //     timeUnit: '1s',
+            // },
+            // breakpoint: {
+            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
+            //     preAllocatedVUs: 1000,
+            //     stages: [
+            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
+            //     ],
+            // },
+            // throughput: {
+            //     executor: 'shared-iterations',
+            //     vus: 100,
+            //     iterations: 200,
+            //     maxDuration: '40s',
+            // },
        },
    };
 }

+function generate_payload(gpt, max_new_tokens) {
+    const input = gpt["conversations"][0]["value"];
+    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
+}

-export function run(host, generate_payload, max_new_tokens) {
-    const headers = {'Content-Type': 'application/json'};
-    const query = randomItem(shareGPT);
-    const payload = JSON.stringify(generate_payload(query));
-    const res = http.post(`http://${host}/generate`, payload, {
+export const options = get_options();
+
+export default function run() {
+    const headers = { 'Content-Type': 'application/json' };
+    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
+    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
+    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
-    if(res.status >= 400 && res.status < 500){
+    if (res.status >= 400 && res.status < 500) {
        return;
    }


    check(res, {
-        'Post status is 200': (r) => res.status === 200,
+        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
-        const n_tokens = body.details.tokens.length;
-        const latency_ms_per_token = duration / n_tokens;
+        const completion_tokens = body.usage.completion_tokens;
+        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
-        const latency_in_s = latency_ms_per_token / 1000;
-        const individual_throughput = 1 / latency_in_s;
-        const _input_tokens = body.details.prefill.length;
-        tokens.add(n_tokens + _input_tokens);
-        input_tokens.add(_input_tokens);
-        new_tokens.add(n_tokens);
+        const prompt_tokens = body.usage.prompt_tokens;
+        input_tokens.add(prompt_tokens);
+        new_tokens.add(completion_tokens);
+        tokens.add(completion_tokens + prompt_tokens);
    }
 }
--- a/load_tests/tgi.js
+++ b/load_tests/tgi.js
@ -1,17 +0,0 @@
-import { get_options, run } from "./common.js";
-
-const reference_latency_ms = 70;
-const host = __ENV.HOST || '127.0.0.1:8000';
-const max_new_tokens = 50;
-
-
-function generate_payload(gpt){
-    const input = gpt["conversations"][0]["value"];
-    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "decoder_input_details": true}}
-}
-
-export const options = get_options(reference_latency_ms);
-
-export default function(){
-    run(host, generate_payload, max_new_tokens);
-}
--- a/load_tests/vllm.js
+++ b/load_tests/vllm.js
@ -1,17 +0,0 @@
-import { get_options, run } from "./common.js";
-
-const reference_latency_ms = 22;
-const host = __ENV.HOST || '127.0.0.1:8000';
-const max_new_tokens = 50;
-
-
-function generate_payload(gpt){
-    const input = gpt["conversations"][0]["value"];
-    return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
-}
-
-export const options = get_options(reference_latency_ms);
-
-export default function(){
-    run(host, generate_payload, max_new_tokens);
-}