Merge cluster to master #3
|
@ -57,7 +57,6 @@ def openai_chat_completions():
|
||||||
else:
|
else:
|
||||||
handler.prompt = transform_messages_to_prompt(handler.request.json['messages'])
|
handler.prompt = transform_messages_to_prompt(handler.request.json['messages'])
|
||||||
|
|
||||||
generated_text = ''
|
|
||||||
response_status_code = 0
|
response_status_code = 0
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
@ -98,61 +97,62 @@ def openai_chat_completions():
|
||||||
oai_string = generate_oai_string(30)
|
oai_string = generate_oai_string(30)
|
||||||
|
|
||||||
def generate():
|
def generate():
|
||||||
response = generator(msg_to_backend, handler.backend_url)
|
try:
|
||||||
generated_text = ''
|
response = generator(msg_to_backend, handler.backend_url)
|
||||||
partial_response = b''
|
generated_text = ''
|
||||||
for chunk in response.iter_content(chunk_size=1):
|
partial_response = b''
|
||||||
partial_response += chunk
|
for chunk in response.iter_content(chunk_size=1):
|
||||||
if partial_response.endswith(b'\x00'):
|
partial_response += chunk
|
||||||
json_strs = partial_response.split(b'\x00')
|
if partial_response.endswith(b'\x00'):
|
||||||
for json_str in json_strs:
|
json_strs = partial_response.split(b'\x00')
|
||||||
if json_str:
|
for json_str in json_strs:
|
||||||
try:
|
if json_str:
|
||||||
json_obj = json.loads(json_str.decode())
|
try:
|
||||||
new = json_obj['text'][0].split(handler.prompt + generated_text)[1]
|
json_obj = json.loads(json_str.decode())
|
||||||
generated_text = generated_text + new
|
new = json_obj['text'][0].split(handler.prompt + generated_text)[1]
|
||||||
except IndexError:
|
generated_text = generated_text + new
|
||||||
# ????
|
except IndexError:
|
||||||
continue
|
# ????
|
||||||
|
continue
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"id": f"chatcmpl-{oai_string}",
|
"id": f"chatcmpl-{oai_string}",
|
||||||
"object": "chat.completion.chunk",
|
"object": "chat.completion.chunk",
|
||||||
"created": int(time.time()),
|
"created": int(time.time()),
|
||||||
"model": model,
|
"model": model,
|
||||||
"choices": [
|
"choices": [
|
||||||
{
|
{
|
||||||
"index": 0,
|
"index": 0,
|
||||||
"delta": {
|
"delta": {
|
||||||
"content": new
|
"content": new
|
||||||
},
|
},
|
||||||
"finish_reason": None
|
"finish_reason": None
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
yield f'data: {json.dumps(data)}\n\n'
|
yield f'data: {json.dumps(data)}\n\n'
|
||||||
yield 'data: [DONE]\n\n'
|
yield 'data: [DONE]\n\n'
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
elapsed_time = end_time - start_time
|
elapsed_time = end_time - start_time
|
||||||
log_to_db(
|
log_to_db(
|
||||||
handler.client_ip,
|
handler.client_ip,
|
||||||
handler.token,
|
handler.token,
|
||||||
handler.prompt,
|
handler.prompt,
|
||||||
generated_text,
|
generated_text,
|
||||||
elapsed_time,
|
elapsed_time,
|
||||||
handler.parameters,
|
handler.parameters,
|
||||||
r_headers,
|
r_headers,
|
||||||
response_status_code,
|
response_status_code,
|
||||||
r_url,
|
r_url,
|
||||||
handler.backend_url,
|
handler.backend_url,
|
||||||
)
|
)
|
||||||
|
finally:
|
||||||
|
# The worker incremented it, we'll decrement it.
|
||||||
|
decrement_ip_count(handler.client_ip, 'processing_ips')
|
||||||
|
decr_active_workers(handler.selected_model, handler.backend_url)
|
||||||
|
print(len(generated_text))
|
||||||
|
|
||||||
return Response(generate(), mimetype='text/event-stream')
|
return Response(generate(), mimetype='text/event-stream')
|
||||||
except Exception:
|
except Exception:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return 'INTERNAL SERVER', 500
|
return 'INTERNAL SERVER', 500
|
||||||
finally:
|
|
||||||
# The worker incremented it, we'll decrement it.
|
|
||||||
decrement_ip_count(handler.client_ip, 'processing_ips')
|
|
||||||
decr_active_workers(handler.selected_model, handler.backend_url)
|
|
||||||
print(len(generated_text))
|
|
||||||
|
|
Reference in New Issue