try to cancel inference if disconnected from client
This commit is contained in:
parent e42f2b6819
commit e1d3fca6d3
@@ -137,7 +137,10 @@ def stream(ws):
                             'text': new
                         }))
                     except:
-                        # The client closed the stream.
+                        # The client has closed the stream.
+                        if request:
+                            request.close()
+                        ws.close()
             end_time = time.time()
             elapsed_time = end_time - start_time
             log_prompt(handler.client_ip, handler.token, input_prompt, generated_text, elapsed_time, handler.parameters, r_headers, response_status_code, r_url, response_tokens=tokenize(generated_text))
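The change above is the usual cleanup pattern for a streaming proxy: when ws.send() raises, the client has disconnected, so the handler closes the still-open backend request (so no more tokens are pulled from it) and then its own side of the socket. A minimal sketch of that pattern, assuming the backend is reached with requests in streaming mode; relay_stream, backend_url, and the message shape are illustrative, not the project's actual code:

import json
import requests

def relay_stream(ws, backend_url, payload):
    # Stream the backend response so tokens can be forwarded as they arrive.
    request = requests.post(backend_url, json=payload, stream=True)
    try:
        for line in request.iter_lines():
            if not line:
                continue
            try:
                ws.send(json.dumps({'text': line.decode()}))
            except Exception:
                # The client has closed the stream: drop the backend request
                # so we stop reading from it, then close our side of the socket.
                request.close()
                ws.close()
                break
    finally:
        request.close()  # closing twice is harmless; this releases the connection

Note that Response.close() only drops the proxy's side of the connection; whether the backend actually stops generating depends on the backend, which is presumably why the commit message only says "try to" cancel.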
@@ -161,8 +164,10 @@ def stream(ws):
                     'message_num': message_num,
                     'text': generated_text
                 }))
-                log_in_bg(generated_text, is_error=True, status_code=response_status_code)
+                if request:
+                    request.close()
                 ws.close()
+                log_in_bg(generated_text, is_error=True, status_code=response_status_code)
                 return
         finally:
             # The worker incremented it, we'll decrement it.
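The finally: context line above points at the other half of the cleanup: a worker incremented a counter of in-flight generations when it picked up the prompt, and the stream handler must decrement it no matter how the connection ends. A generic sketch of that bookkeeping, assuming a simple in-process thread-safe counter (the project may track this differently, e.g. in Redis):

import threading

_lock = threading.Lock()
active_generations = 0  # illustrative stand-in for the real counter

def increment():
    global active_generations
    with _lock:
        active_generations += 1

def decrement():
    global active_generations
    with _lock:
        active_generations -= 1

def stream(ws):
    # The worker calls increment() when it picks up the prompt.
    try:
        ...  # relay tokens to the websocket client
    finally:
        # The worker incremented it, we'll decrement it, even if the client
        # disconnected and the handler above already closed request and ws.
        decrement()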
@@ -25,7 +25,7 @@ from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.stream import init_socketio
 
-# TODO: have the workers handle streaming too
+# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
 # TODO: add backend fallbacks. Backends at the bottom of the list are higher priority and are fallbacks if the upper ones fail
 # TODO: implement background thread to test backends via sending test prompts
 # TODO: if backend fails request, mark it as down
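The rewritten TODO records the longer-term plan: push every token over socket.io and cancel generation as soon as the socket disconnects, instead of only noticing the disconnect when ws.send() fails. A rough sketch of that idea using Flask-SocketIO; the event names, the per-client cancellation flags, and generate_and_stream are assumptions for illustration, not the project's API:

import threading

from flask import Flask, request
from flask_socketio import SocketIO

app = Flask(__name__)
socketio = SocketIO(app)

# One cancellation flag per connected client (illustrative bookkeeping).
cancel_flags = {}

@socketio.on('connect')
def on_connect():
    cancel_flags[request.sid] = threading.Event()

@socketio.on('disconnect')
def on_disconnect():
    # The client went away: tell the generation loop to stop.
    flag = cancel_flags.pop(request.sid, None)
    if flag is not None:
        flag.set()

def generate_and_stream(sid, token_iterator):
    # token_iterator stands in for whatever the VLLM backend yields.
    flag = cancel_flags.get(sid)
    for token in token_iterator:
        if flag is None or flag.is_set():
            break  # cancel generation instead of streaming into the void
        socketio.emit('token', {'text': token}, to=sid)

Actually aborting the in-flight VLLM generation would additionally require dropping the backend request when the flag is set, much as the commit above does by closing the streamed response.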