Fix prefix caching for chat completion since we removed logprobs.
parent db1114955a
commit 1352f70847

@@ -0,0 +1,9 @@
ShareGPT_V3_unfiltered_cleaned_split.json:
	wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

prepare_share: ShareGPT_V3_unfiltered_cleaned_split.json
	python filter.py

prepare_orca:
	python orca.py

@@ -0,0 +1,94 @@
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

const host = __ENV.HOST;
const model_id = __ENV.MODEL_ID;
const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');
const max_new_tokens = 50;

// const shareGPT = JSON.parse(open("ShareGPT_V3_unfiltered_cleaned_split.json"))
const shareGPT = JSON.parse(open("small.json"))


export function get_options() {
    return {
        thresholds: {
            http_req_failed: ['rate==0'],
            // time_per_token: [{
            //     threshold: `p(50)<${5 * reference_latency_ms}`,
            //     abortOnFail: true,
            //     delayAbortEval: '10s'
            // }],
        },
        scenarios: {
            // single_user: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 1,
            //     rate: 20,
            //     timeUnit: '1s',
            // },
            // load_test: {
            //     executor: 'constant-arrival-rate',
            //     duration: '60s',
            //     preAllocatedVUs: 100,
            //     rate: 1,
            //     timeUnit: '1s',
            // },
            // breakpoint: {
            //     executor: 'ramping-arrival-rate', //Assure load increase if the system slows
            //     preAllocatedVUs: 300,
            //     stages: [
            //         { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
            //     ],
            // },
            throughput: {
                executor: 'shared-iterations',
                vus: 100,
                iterations: 200,
                maxDuration: '40s',
            },
        },
    };
}

function generate_payload(gpt, max_new_tokens) {
    const input = gpt["conversations"][0]["value"];
    return { "messages": [{ "role": "user", "content": input }], "temperature": 0, "model": `${model_id}`, "max_tokens": max_new_tokens }
}

export const options = get_options();

export default function run() {
    const headers = { 'Content-Type': 'application/json' };
    const query = shareGPT[scenario.iterationInTest % shareGPT.length];
    const payload = JSON.stringify(generate_payload(query, max_new_tokens));
    const res = http.post(`http://${host}/v1/chat/completions`, payload, {
        headers,
    });
    if (res.status >= 400 && res.status < 500) {
        return;
    }


    check(res, {
        'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
        const body = res.json();
        const completion_tokens = body.usage.completion_tokens;
        const latency_ms_per_token = duration / completion_tokens;
        timePerToken.add(latency_ms_per_token);
        const prompt_tokens = body.usage.prompt_tokens;
        input_tokens.add(prompt_tokens);
        new_tokens.add(completion_tokens);
        tokens.add(completion_tokens + prompt_tokens);
    }
}
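For reference, a minimal Python equivalent of a single benchmark request (an illustrative sketch, not part of this commit; the host, port, and model id are placeholders, and the payload mirrors what the k6 script above builds):

import requests  # assumed helper dependency, not used by the commit itself

payload = {
    "messages": [{"role": "user", "content": "Hello"}],
    "temperature": 0,
    "model": "my-model",  # placeholder model id
    "max_tokens": 50,
}
res = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
usage = res.json()["usage"]
# Same per-token latency the script records as time_per_token (ms per generated token).
print(res.elapsed.total_seconds() * 1000 / usage["completion_tokens"])
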
@@ -0,0 +1,26 @@
import json


def main():
    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
        data = json.load(f)

    # Select only the first 2k conversations that start with a human.
    max = 2000
    conversations = []
    for conversation in data:
        conv = conversation.get("conversations")
        if conv and conv[0]["from"] == "human":
            # Trim the rest of the output
            conversation["conversations"] = conversation["conversations"][:1]
            conversations.append(conversation)

        if len(conversations) >= max:
            break

    with open("./small.json", "w") as f:
        json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()
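A quick way to sanity-check the output (an illustrative snippet, not part of the commit): each entry in small.json keeps the ShareGPT shape with only the first human turn, which is exactly the field the k6 script reads as the prompt.

import json

with open("small.json") as f:
    conversations = json.load(f)
# The benchmark uses conversations[i]["conversations"][0]["value"] as the prompt text.
print(conversations[0]["conversations"][0]["value"])
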
@@ -0,0 +1,27 @@
import json
import datasets
import tqdm


def main():
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split="train")
    # Select only the first 2k conversations that start with a human.
    max = min(2000, len(dataset))
    conversations = []
    for item in tqdm.tqdm(dataset, total=max):
        conversation = {
            "conversations": [
                {"from": "human", "value": item["question"]},
            ],
            "id": item["id"],
        }
        conversations.append(conversation)
        if len(conversations) >= max:
            break

    with open("./small.json", "w") as f:
        json.dump(conversations, f, indent=4)


if __name__ == "__main__":
    main()

@@ -923,7 +923,6 @@ impl ChatRequest
            messages,
            seed,
            stop,
            stream,
            tools,
            tool_choice,
            tool_prompt,

@@ -1003,7 +1002,7 @@ impl ChatRequest
            truncate: None,
            watermark: false,
            details: true,
            decoder_input_details: !stream,
            decoder_input_details: false,
            seed,
            top_n_tokens: top_logprobs,
            grammar,
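Read together with the commit title, the hunk above swaps decoder_input_details from !stream to false: chat completions no longer return prompt logprobs, so the router no longer requests decoder input details, which presumably is what keeps the prompt prefix eligible for prefix caching. A minimal sketch of that intent (illustrative Python, not the router's actual Rust code):

# Illustrative only: how the flag is derived for chat requests before and after this commit.
def decoder_input_details(stream: bool) -> bool:
    # Before: requested for non-streaming chat requests.
    # return not stream
    # After: never requested, since chat completions no longer expose prompt logprobs.
    return False
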
@@ -1560,9 +1560,6 @@ class FlashCausalLM(Model):
        batch_num_blocks = batch.num_blocks

        num_tokens = batch.to_pb().current_tokens
        logger.info(f"BLOCKS {batch.num_blocks}")
        free_memory = get_free_memory(self.device, MEMORY_FRACTION)
        logger.info(f"Free memory {free_memory}")
        if SYSTEM == "rocm" and os.environ.get("PYTORCH_TUNABLEOP_ENABLED", False):
            torch.cuda.tunable.tuning_enable(False)
        _, _batch, _ = self.generate_token(batch)