From c4422e567811051e40469551c58a8078158a6656 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Fri, 18 Aug 2023 19:28:56 +0200
Subject: [PATCH] Adding small benchmark script. (#881)

# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
---
 load_tests/common.js | 64 ++++++++++++++++++++++++++++++++++++++++++++
 load_tests/tgi.js    | 17 ++++++++++++
 load_tests/vllm.js   | 17 ++++++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 load_tests/common.js
 create mode 100644 load_tests/tgi.js
 create mode 100644 load_tests/vllm.js

diff --git a/load_tests/common.js b/load_tests/common.js
new file mode 100644
index 00000000..be812e9b
--- /dev/null
+++ b/load_tests/common.js
@@ -0,0 +1,64 @@
+import { check, randomSeed } from 'k6';
+import http from 'k6/http';
+import { Trend, Counter } from 'k6/metrics';
+import { randomItem } from 'https://jslib.k6.io/k6-utils/1.2.0/index.js';
+
+const seed = 0;
+
+const host = __ENV.HOST || '127.0.0.1:8000';
+const timePerToken = new Trend('time_per_token', true);
+const throughput = new Counter('tokens_per_s');
+
+randomSeed(seed);
+// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
+const shareGPT = JSON.parse(open("small.json"))
+
+
+export function get_options(reference_latency_ms){
+    return {
+        thresholds: {
+            http_req_failed: ['rate==0'],
+            time_per_token: [{
+                threshold: `p(50)<${3 * reference_latency_ms}`,
+                abortOnFail: true,
+                delayAbortEval: '10s'
+            }],
+        },
+        scenarios: {
+            load_test: {
+                executor: 'constant-arrival-rate',
+                duration: '60s',
+                preAllocatedVUs: 100,
+                rate: 10,
+                timeUnit: '1s',
+            },
+        },
+    };
+}
+
+
+export function run(host, generate_payload, max_new_tokens) {
+    const headers = {'Content-Type': 'application/json'};
+    const query = randomItem(shareGPT);
+    const payload = JSON.stringify(generate_payload(query));
+    const res = http.post(`http://${host}/generate`, payload, {
+        headers,
+    });
+    if(res.status >= 400 && res.status < 500){
+        return;
+    }
+
+    check(res, {
+        'Post status is 200': (r) => res.status === 200,
+    });
+    const n_tokens = max_new_tokens;
+    const timings = res.timings.duration;
+
+    if (res.status === 200) {
+        const latency_ms_per_token = timings / n_tokens;
+        timePerToken.add(latency_ms_per_token);
+        const latency_in_s = latency_ms_per_token / 1000;
+        const individual_throughput = 1 / latency_in_s;
+        throughput.add(individual_throughput);
+    }
+}
diff --git a/load_tests/tgi.js b/load_tests/tgi.js
new file mode 100644
index 00000000..93a0e278
--- /dev/null
+++ b/load_tests/tgi.js
@@ -0,0 +1,17 @@
+import { get_options, run } from "./common.js";
+
+const reference_latency_ms = 30;
+const host = __ENV.HOST || '127.0.0.1:8000';
+const max_new_tokens = 50;
+
+
+function generate_payload(gpt){
+    const input = gpt["conversations"][0]["value"];
+    return {"inputs": input, "parameters": {"max_new_tokens": max_new_tokens, "temperature" : 0.5}}
+}
+
+export const options = get_options(reference_latency_ms);
+
+export default function(){
+    run(host, generate_payload, max_new_tokens);
+}
diff --git a/load_tests/vllm.js b/load_tests/vllm.js
new file mode 100644
index 00000000..fcb38262
--- /dev/null
+++ b/load_tests/vllm.js
@@ -0,0 +1,17 @@
+import { get_options, run } from "./common.js";
+
+const reference_latency_ms = 22;
+const host = __ENV.HOST || '127.0.0.1:8000';
+const max_new_tokens = 50;
+
+
+function generate_payload(gpt){
+    const input = gpt["conversations"][0]["value"];
+    return {"prompt": input, "temperature": 0.5, "ignore_eos": true}
+}
+
+export const options = get_options(reference_latency_ms);
+
+export default function(){
+    run(host, generate_payload, max_new_tokens);
+}
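
The two scenario scripts differ only in the payload they build; `common.js` holds the shared k6 options and request loop. As a rough usage sketch (an assumption, not part of the patch): with k6 installed and a ShareGPT-style `small.json` sample placed next to the scripts, a run would typically be launched with `k6 run -e HOST=127.0.0.1:8080 load_tests/tgi.js`. A hypothetical third backend serving the same `/generate` route could be benchmarked by reusing the helpers in the same way; the field names in `generate_payload` below are illustrative placeholders only, not an API confirmed by this PR.

```js
// load_tests/other_backend.js — hypothetical sketch, not included in this PR.
import { get_options, run } from "./common.js";

// Assumed baseline latency for the backend under test; the p(50) abort
// threshold in common.js is derived from this value (3x the reference).
const reference_latency_ms = 25;
const host = __ENV.HOST || '127.0.0.1:8000';
const max_new_tokens = 50;

// Build whatever body the backend's /generate route expects; the keys
// below ("text", "max_tokens") are placeholders for illustration.
function generate_payload(gpt){
    const input = gpt["conversations"][0]["value"];
    return {"text": input, "max_tokens": max_new_tokens, "temperature": 0.5};
}

export const options = get_options(reference_latency_ms);

export default function(){
    run(host, generate_payload, max_new_tokens);
}
```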