Fix prefix caching for chat completion since we removed logprobs.

2024-12-02 07:51:00 +01:00 · 2024-12-02 07:51:00 +01:00 · 1352f70847
parent db1114955a
commit 1352f70847
6 changed files with 157 additions and 5 deletions
--- a/load_tests/Makefile
+++ b/load_tests/Makefile
@ -0,0 +1,9 @@
+.PHONY: prepare_share prepare_orca  # task names, not files: always run when requested
+ShareGPT_V3_unfiltered_cleaned_split.json:  # no prerequisites: downloaded only when the file is absent
+	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json  # filter.py reads the dump and writes small.json
+	python filter.py
+
+prepare_orca:  # orca.py writes small.json from the OpenOrca dataset
+	python orca.py
--- a/load_tests/common.js
+++ b/load_tests/common.js
@ -0,0 +1,94 @@
+import { check } from 'k6';
+import { scenario } from 'k6/execution';
+import http from 'k6/http';
+import { Trend, Counter } from 'k6/metrics';
+
+const host = __ENV.HOST; // interpolated into `http://${host}/v1/chat/completions`
+const model_id = __ENV.MODEL_ID; // sent as the "model" field of every request
+const timePerToken = new Trend('time_per_token', true); // request duration divided by completion tokens
+const tokens = new Counter('tokens'); // prompt + completion tokens
+const new_tokens = new Counter('new_tokens'); // completion tokens only
+const input_tokens = new Counter('input_tokens'); // prompt tokens only
+const max_new_tokens = 50; // passed as "max_tokens" in each request
+
+// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
+const shareGPT = JSON.parse(open("small.json")) // array of records, each with a "conversations" list
+
+
+/**
+ * Build the k6 options for this load test.
+ *
+ * Only the `throughput` scenario is active; the other scenarios and the
+ * latency threshold are kept as comments so they can be re-enabled by hand.
+ */
+export function get_options() {
+    const thresholds = {
+        // Require a failure rate of exactly zero for HTTP requests.
+        http_req_failed: ['rate==0'],
+        // Disabled: `reference_latency_ms` is not defined in this file.
+        // time_per_token: [{
+        //     threshold: `p(50)<${5 * reference_latency_ms}`,
+        //     abortOnFail: true,
+        //     delayAbortEval: '10s'
+        // }],
+    };
+
+    // Disabled scenarios:
+    // single_user: {
+    //     executor: 'constant-arrival-rate', duration: '60s',
+    //     preAllocatedVUs: 1, rate: 20, timeUnit: '1s',
+    // },
+    // load_test: {
+    //     executor: 'constant-arrival-rate', duration: '60s',
+    //     preAllocatedVUs: 100, rate: 1, timeUnit: '1s',
+    // },
+    // breakpoint: {
+    //     executor: 'ramping-arrival-rate', // Assure load increase if the system slows
+    //     preAllocatedVUs: 300,
+    //     stages: [{ duration: '60s', target: 100 }], // just slowly ramp-up to a HUGE load
+    // },
+    const throughput = {
+        executor: 'shared-iterations',
+        vus: 100,
+        iterations: 200,
+        maxDuration: '40s',
+    };
+
+    return { thresholds, scenarios: { throughput } };
+}
+
+function generate_payload(gpt, max_new_tokens) { // chat-completion request body for one dataset record
+    const { value: content } = gpt.conversations[0]; // only the opening turn is sent
+    return { messages: [{ role: "user", content }], temperature: 0, model: `${model_id}`, max_tokens: max_new_tokens };
+}
+
+export const options = get_options(); // k6 reads the exported `options` object as the test configuration
+
+// One k6 iteration: send a single chat-completion request and record token metrics.
+export default function run() {
+    // The iteration counter selects the record; the modulo wraps around the dataset.
+    const sample = shareGPT[scenario.iterationInTest % shareGPT.length];
+    const body = JSON.stringify(generate_payload(sample, max_new_tokens));
+    const params = { headers: { 'Content-Type': 'application/json' } };
+    const res = http.post(`http://${host}/v1/chat/completions`, body, params);
+
+    // 4xx responses are skipped entirely: no check is recorded for them.
+    const is_client_error = res.status >= 400 && res.status < 500;
+    if (is_client_error) {
+        return;
+    }
+
+    check(res, {
+        'Post status is 200': (r) => r.status === 200,
+    });
+    if (res.status !== 200) {
+        return;
+    }
+
+    const { completion_tokens, prompt_tokens } = res.json().usage;
+    // Whole-request duration averaged over the generated tokens.
+    timePerToken.add(res.timings.duration / completion_tokens);
+    input_tokens.add(prompt_tokens);
+    new_tokens.add(completion_tokens);
+    tokens.add(completion_tokens + prompt_tokens);
+}
--- a/load_tests/filter.py
+++ b/load_tests/filter.py
@ -0,0 +1,26 @@
+import json
+
+
+def main():
+    """Write the first 2000 human-initiated ShareGPT conversations (first turn only) to small.json."""
+    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
+        data = json.load(f)
+
+    limit = 2000  # cap on kept conversations (avoids shadowing the builtin `max`)
+    conversations = []
+    for conversation in data:
+        conv = conversation.get("conversations")
+        if conv and conv[0]["from"] == "human":
+            # Keep only the opening human turn; later turns are dropped.
+            conversation["conversations"] = conv[:1]
+            conversations.append(conversation)
+            # Count the kept conversations, not the keys of the current record.
+            if len(conversations) >= limit:
+                break
+
+    with open("./small.json", "w") as f:
+        json.dump(conversations, f, indent=4)
+
+
+if __name__ == "__main__":
+    main()
--- a/load_tests/orca.py
+++ b/load_tests/orca.py
@ -0,0 +1,27 @@
+import json
+import datasets
+import tqdm
+
+
+def main():
+    """Write up to 2000 OpenOrca questions to small.json.
+
+    Each row becomes a single-turn conversation holding one "human" message.
+    """
+    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
+    # Never ask for more rows than the split holds.
+    limit = min(2000, len(dataset))
+    records = []
+    for row in tqdm.tqdm(dataset, total=limit):
+        human_turn = {"from": "human", "value": row["question"]}
+        records.append({"conversations": [human_turn], "id": row["id"]})
+        # Stop explicitly: the progress-bar total does not bound the iteration.
+        if len(records) >= limit:
+            break
+
+    with open("./small.json", "w") as out:
+        json.dump(records, out, indent=4)
+
+
+if __name__ == "__main__":
+    main()
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -923,7 +923,6 @@ impl ChatRequest {
            messages,
            seed,
            stop,
-            stream,
            tools,
            tool_choice,
            tool_prompt,
@ -1003,7 +1002,7 @@ impl ChatRequest {
                    truncate: None,
                    watermark: false,
                    details: true,
-                    decoder_input_details: !stream,
+                    decoder_input_details: false,
                    seed,
                    top_n_tokens: top_logprobs,
                    grammar,
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@ -1560,9 +1560,6 @@ class FlashCausalLM(Model):
            batch_num_blocks = batch.num_blocks

            num_tokens = batch.to_pb().current_tokens
-            logger.info(f"BLOCKS {batch.num_blocks}")
-            free_memory = get_free_memory(self.device, MEMORY_FRACTION)
-            logger.info(f"Free memory {free_memory}")
            if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
                torch.cuda.tunable.tuning_enable(False)
            _, _batch, _ = self.generate_token(batch)