try to cancel inference if disconnected from client

Cyberes 2023-09-28 09:55:31 -06:00
parent e42f2b6819
commit e1d3fca6d3
2 changed files with 8 additions and 3 deletions


@@ -137,7 +137,10 @@ def stream(ws):
        'text': new
    }))
except:
    # The client has closed the stream.
    if request:
        request.close()
    ws.close()
end_time = time.time()
elapsed_time = end_time - start_time
log_prompt(handler.client_ip, handler.token, input_prompt, generated_text, elapsed_time, handler.parameters, r_headers, response_status_code, r_url, response_tokens=tokenize(generated_text))
@@ -161,8 +164,10 @@ def stream(ws):
        'message_num': message_num,
        'text': generated_text
    }))
    if request:
        request.close()
    ws.close()
    log_in_bg(generated_text, is_error=True, status_code=response_status_code)
    return
finally:
    # The worker incremented it; we'll decrement it.
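
Both hunks apply the same fix: if ws.send() raises, the client has disconnected, so the still-open streaming request to the backend is closed as well. Dropping that connection gives the backend a chance to cancel the in-flight generation instead of burning GPU time on a response nobody will read (hence "try to cancel" in the commit title). A minimal self-contained sketch of the pattern, with hypothetical names (stream_tokens, backend_url) that are not from this repo:

import json

import requests


def stream_tokens(ws, backend_url, payload):
    # Open a streaming HTTP request to the inference backend.
    request = requests.post(backend_url, json=payload, stream=True)
    try:
        for line in request.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            try:
                ws.send(json.dumps({'text': chunk.get('text', '')}))
            except Exception:
                # The client has closed the stream: close the backend
                # request too, so generation can be cancelled server-side.
                if request:
                    request.close()
                ws.close()
                return
    finally:
        request.close()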


@@ -25,7 +25,7 @@ from llm_server.routes.server_error import handle_server_error
from llm_server.routes.v1 import bp
from llm_server.stream import init_socketio
# TODO: have the workers handle streaming too
# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
# TODO: add backend fallbacks. Backends at the top of the list are tried first; the ones below them are fallbacks used when the upper ones fail
# TODO: implement background thread to test backends via sending test prompts
# TODO: if a backend fails a request, mark it as down
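
These TODOs sketch a failover design: try backends in priority order, mark one down when it fails a request, and let a background thread probe downed backends with test prompts until they recover. A rough sketch of that bookkeeping, assuming nothing about this repo's actual API (BACKENDS, pick_backend, and the /generate endpoint are all hypothetical):

import threading
import time

import requests

# Ordered highest-priority first; entries further down are fallbacks.
BACKENDS = ['http://backend-a:7000', 'http://backend-b:7000']
_down = set()
_lock = threading.Lock()


def pick_backend():
    # Return the highest-priority backend not currently marked down.
    with _lock:
        for url in BACKENDS:
            if url not in _down:
                return url
    return None


def mark_down(url):
    # Called when a backend fails a request.
    with _lock:
        _down.add(url)


def health_check_worker(interval=60):
    # Background thread: probe downed backends with a tiny test prompt
    # and restore them once they answer again.
    while True:
        with _lock:
            downed = list(_down)
        for url in downed:
            try:
                # '/generate' and the payload are assumptions, not this repo's API.
                r = requests.post(url + '/generate',
                                  json={'prompt': 'ping', 'max_new_tokens': 1},
                                  timeout=10)
                if r.ok:
                    with _lock:
                        _down.discard(url)
            except requests.RequestException:
                pass  # still down
        time.sleep(interval)


threading.Thread(target=health_check_worker, daemon=True).start()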