diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index f6e978b..0fc8f40 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -137,7 +137,10 @@ def stream(ws):
                             'text': new
                         }))
                     except:
-                        # The client closed the stream.
+                        # The client has closed the stream.
+                        if request:
+                            request.close()
+                        ws.close()
                         end_time = time.time()
                         elapsed_time = end_time - start_time
                         log_prompt(handler.client_ip, handler.token, input_prompt, generated_text, elapsed_time, handler.parameters, r_headers, response_status_code, r_url, response_tokens=tokenize(generated_text))
@@ -161,8 +164,10 @@ def stream(ws):
                     'message_num': message_num,
                     'text': generated_text
                 }))
-                log_in_bg(generated_text, is_error=True, status_code=response_status_code)
+                if request:
+                    request.close()
                 ws.close()
+                log_in_bg(generated_text, is_error=True, status_code=response_status_code)
                 return
     finally:
         # The worker incremented it, we'll decrement it.
diff --git a/server.py b/server.py
index 1010e09..06482d4 100644
--- a/server.py
+++ b/server.py
@@ -25,7 +25,7 @@
 from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.stream import init_socketio
-# TODO: have the workers handle streaming too
+# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
 # TODO: add backend fallbacks. Backends at the bottom of the list are higher priority and are fallbacks if the upper ones fail
 # TODO: implement background thread to test backends via sending test prompts
 # TODO: if backend fails request, mark it as down
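
Both hunks in generate_stream.py apply the same cleanup pattern: when sending to the client fails (or an error reply is about to be returned), close the still-open streaming response to the backend before closing the WebSocket, and only then log. Below is a minimal sketch of that pattern, not the project's actual handler: it assumes `request` is a `requests.Response` opened with `stream=True` against the backend (the diff does not show where it comes from), that `ws` exposes send()/close() like a flask-sock WebSocket, and that `stream_to_client`, `backend_url`, and `payload` are hypothetical stand-ins for the real handler state.

import json
import time

import requests


def stream_to_client(ws, backend_url, payload):
    # Hedged sketch of the disconnect-cleanup pattern shown in the hunks above.
    start_time = time.time()
    generated_text = ''
    # Streaming HTTP response to the backend; the hunks refer to it as `request`.
    request = requests.post(backend_url, json=payload, stream=True)
    try:
        for chunk in request.iter_lines():
            if not chunk:
                continue
            new = chunk.decode('utf-8')
            generated_text += new
            try:
                ws.send(json.dumps({'text': new}))
            except Exception:
                # The client has closed the stream: release the backend
                # connection first, then the WebSocket, then log.
                if request:
                    request.close()
                ws.close()
                elapsed_time = time.time() - start_time
                print(f'client disconnected after {elapsed_time:.2f}s')
                return
    finally:
        # Always release the backend connection, even on a clean exit.
        request.close()

Closing the backend response before the WebSocket releases the upstream connection as soon as the client is gone; the second hunk also moves log_in_bg() after the close calls, presumably so teardown is not delayed by logging.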