Merge cluster to master #3
|
@ -50,7 +50,7 @@ def do_stream(ws, model_name):
|
|||
token=handler.token,
|
||||
prompt=input_prompt,
|
||||
response=quitting_err_msg,
|
||||
gen_time=elapsed_time,
|
||||
gen_time=None,
|
||||
parameters=handler.parameters,
|
||||
headers=r_headers,
|
||||
backend_response_code=response_status_code,
|
||||
|
|
|
@ -24,17 +24,9 @@ from llm_server.routes.server_error import handle_server_error
|
|||
from llm_server.routes.v1 import bp
|
||||
from llm_server.sock import init_socketio
|
||||
|
||||
# TODO: per-backend workers
|
||||
# TODO: allow setting concurrent gens per-backend
|
||||
# TODO: set the max tokens to that of the lowest backend
|
||||
# TODO: implement RRD backend loadbalancer option
|
||||
# TODO: have VLLM reject a request if it already has n == concurrent_gens running
|
||||
# TODO: add a way to cancel VLLM gens. Maybe use websockets?
|
||||
# TODO: use coloredlogs
|
||||
# TODO: need to update opts. for workers
|
||||
# TODO: add a healthcheck to VLLM
|
||||
# TODO: allow choosing the model by the URL path
|
||||
# TODO: have VLLM report context size, uptime
|
||||
|
||||
# Lower priority
|
||||
# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
|
||||
|
|
Reference in New Issue