try to cancel inference if disconnected from client
parent e42f2b6819
commit e1d3fca6d3

@@ -137,7 +137,10 @@ def stream(ws):
                        'text': new
                    }))
            except:
-                # The client closed the stream.
+                # The client has closed the stream.
+                if request:
+                    request.close()
+                ws.close()
                end_time = time.time()
                elapsed_time = end_time - start_time
                log_prompt(handler.client_ip, handler.token, input_prompt, generated_text, elapsed_time, handler.parameters, r_headers, response_status_code, r_url, response_tokens=tokenize(generated_text))

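The hunk above is the heart of the commit: when ws.send() raises because the client went away, the handler now closes the streaming requests response to the backend before closing the websocket, which drops the HTTP connection and gives the backend its cue to stop generating. A minimal sketch of the pattern, assuming a flask-sock style ws handle and using iter_lines() as a stand-in for the backend-specific chunk decoding:

import json

def relay(ws, request):
    # `ws` is the client websocket; `request` is the streaming HTTP
    # response from the inference backend (a requests.Response).
    generated_text = ''
    try:
        for new in request.iter_lines(decode_unicode=True):
            if not new:
                continue
            generated_text += new
            ws.send(json.dumps({'text': new}))
    except Exception:
        # The client has closed the stream. Closing the backend response
        # tears down the HTTP connection, which lets the backend notice
        # the disconnect and cancel the in-flight generation.
        if request:
            request.close()
        ws.close()
    return generated_text
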
@@ -161,8 +164,10 @@ def stream(ws):
                    'message_num': message_num,
                    'text': generated_text
                }))
-                log_in_bg(generated_text, is_error=True, status_code=response_status_code)
+                if request:
+                    request.close()
                ws.close()
+                log_in_bg(generated_text, is_error=True, status_code=response_status_code)
                return
    finally:
        # The worker incremented it, we'll decrement it.

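This second hunk applies the same backend-closing fix to the error path and moves the log_in_bg() call after the socket teardown, so both connections are released before any logging work begins. Only the call site is visible in the diff; a plausible sketch of such a helper, with the _write_log target being hypothetical:

import threading

def log_in_bg(generated_text, is_error=False, status_code=None):
    # Hand the (potentially slow) log write to a daemon thread so the
    # request handler can return right after closing its sockets.
    threading.Thread(
        target=_write_log,  # hypothetical writer, e.g. a database insert
        args=(generated_text, is_error, status_code),
        daemon=True,
    ).start()
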
@@ -25,7 +25,7 @@ from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.stream import init_socketio
 
-# TODO: have the workers handle streaming too
+# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
 # TODO: add backend fallbacks. Backends at the bottom of the list are lower priority and are fallbacks if the upper ones fail
 # TODO: implement background thread to test backends via sending test prompts
 # TODO: if backend fails request, mark it as down

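The updated TODOs sketch where this is headed: socket.io streaming with disconnect-triggered cancellation, plus a prioritized backend list with health checks and failure marking. For illustration only, a fallback loop over such a list might look like the following; generate_with_fallback and mark_backend_down are hypothetical and not in the repo:

import requests

def generate_with_fallback(backends, payload):
    # Try each backend URL in priority order, falling back to the next
    # entry when a request fails, per the fallback TODO above.
    last_error = None
    for url in backends:
        try:
            response = requests.post(url, json=payload, timeout=60)
            response.raise_for_status()
            return response
        except requests.RequestException as err:
            last_error = err
            mark_backend_down(url)  # hypothetical, per the "mark it as down" TODO
    raise RuntimeError('all backends failed') from last_error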