Merge cluster to master #3
|
@ -50,7 +50,7 @@ def do_stream(ws, model_name):
|
||||||
token=handler.token,
|
token=handler.token,
|
||||||
prompt=input_prompt,
|
prompt=input_prompt,
|
||||||
response=quitting_err_msg,
|
response=quitting_err_msg,
|
||||||
gen_time=elapsed_time,
|
gen_time=None,
|
||||||
parameters=handler.parameters,
|
parameters=handler.parameters,
|
||||||
headers=r_headers,
|
headers=r_headers,
|
||||||
backend_response_code=response_status_code,
|
backend_response_code=response_status_code,
|
||||||
|
|
|
@ -24,17 +24,9 @@ from llm_server.routes.server_error import handle_server_error
|
||||||
from llm_server.routes.v1 import bp
|
from llm_server.routes.v1 import bp
|
||||||
from llm_server.sock import init_socketio
|
from llm_server.sock import init_socketio
|
||||||
|
|
||||||
# TODO: per-backend workers
|
|
||||||
# TODO: allow setting concurrent gens per-backend
|
|
||||||
# TODO: set the max tokens to that of the lowest backend
|
|
||||||
# TODO: implement RRD backend loadbalancer option
|
|
||||||
# TODO: have VLLM reject a request if it already has n == concurrent_gens running
|
|
||||||
# TODO: add a way to cancel VLLM gens. Maybe use websockets?
|
# TODO: add a way to cancel VLLM gens. Maybe use websockets?
|
||||||
# TODO: use coloredlogs
|
|
||||||
# TODO: need to update opts. for workers
|
# TODO: need to update opts. for workers
|
||||||
# TODO: add a healthcheck to VLLM
|
# TODO: add a healthcheck to VLLM
|
||||||
# TODO: allow choosing the model by the URL path
|
|
||||||
# TODO: have VLLM report context size, uptime
|
|
||||||
|
|
||||||
# Lower priority
|
# Lower priority
|
||||||
# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
|
# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
|
||||||
|
|
Reference in New Issue