diff --git a/llm_server/routes/openai/chat_completions.py b/llm_server/routes/openai/chat_completions.py
index 5b2a83b..4069475 100644
--- a/llm_server/routes/openai/chat_completions.py
+++ b/llm_server/routes/openai/chat_completions.py
@@ -65,25 +65,27 @@ def openai_chat_completions():
                             json_obj = json.loads(json_str.decode())
                             new = json_obj['text'][0].split(handler.prompt + generated_text)[1]
                             generated_text = generated_text + new
-                            data = {
-                                "id": f"chatcmpl-{generate_oai_string(30)}",
-                                "object": "chat.completion.chunk",
-                                "created": int(time.time()),
-                                "model": model,
-                                "choices": [
-                                    {
-                                        "index": 0,
-                                        "delta": {
-                                            "content": new
-                                        },
-                                        "finish_reason": None
-                                    }
-                                ]
-                            }
-                            yield f'data: {json.dumps(data)}\n\n'
                         except IndexError:
+                            # The expected prompt + generated_text prefix was not found in this chunk; skip it.
                             continue
+                        data = {
+                            "id": f"chatcmpl-{generate_oai_string(30)}",
+                            "object": "chat.completion.chunk",
+                            "created": int(time.time()),
+                            "model": model,
+                            "choices": [
+                                {
+                                    "index": 0,
+                                    "delta": {
+                                        "content": new
+                                    },
+                                    "finish_reason": None
+                                }
+                            ]
+                        }
+                        yield f'data: {json.dumps(data)}\n\n'
+
 
                 yield 'data: [DONE]\n\n'
                 end_time = time.time()
                 elapsed_time = end_time - start_time
diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index ed112f6..2e2c188 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -9,7 +9,7 @@ from ..helpers.client import format_sillytavern_err
 from ..helpers.http import require_api_key, validate_json
 from ..ooba_request_handler import OobaRequestHandler
 from ... import opts
-from ...database.database import increment_token_uses, log_prompt
+from ...database.database import log_prompt
 from ...llm.generator import generator
 from ...llm.vllm import tokenize
 from ...stream import sock
@@ -81,7 +81,9 @@ def stream(ws):
         for json_str in json_strs:
             if json_str:
                 try:
+                    json_obj = json.loads(json_str.decode())
                     new = json_obj['text'][0].split(input_prompt + generated_text)[1]
+                    generated_text = generated_text + new
                 except IndexError:
                     # The expected prompt + generated_text prefix was not found in this chunk; skip it.
                     continue
@@ -92,8 +94,6 @@ def stream(ws):
                     'text': new
                 }))
                 message_num += 1
-
-                generated_text = generated_text + new
                 partial_response = b''  # Reset the partial response
                 # If there is no more data, break the loop
 
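
Reviewer note on the first hunk: moving the chunk construction and yield out of the
try block leaves only the fragile split() under the exception handler, without
changing the wire format. Each event is still an OpenAI-compatible SSE frame of the
form "data: <json>\n\n", and the stream still ends with the "data: [DONE]\n\n"
sentinel that OpenAI streaming clients wait for.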
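
Both files now share the same delta-extraction pattern, sketched standalone below.
This is a minimal sketch, not code from the repo: it assumes a vLLM-style streaming
response in which each chunk's text[0] holds the full prompt followed by everything
generated so far, and the names extract_deltas and chunks are hypothetical.

import json

def extract_deltas(chunks, prompt):
    """Yield only the newly generated text from each cumulative chunk."""
    generated_text = ''
    for json_str in chunks:
        try:
            json_obj = json.loads(json_str.decode())
            # Each chunk repeats prompt + all text so far; the tail after that
            # prefix is the new delta for this event.
            new = json_obj['text'][0].split(prompt + generated_text)[1]
            generated_text += new
        except IndexError:
            # split() found no separator, so indexing [1] raised: the chunk did
            # not contain the expected prefix. Skip it, as both hunks do.
            continue
        yield new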