Merge cluster to master #3

Merged
cyberes merged 163 commits from cluster into master on 2023-10-27 19:19:22 -06:00
2 changed files with 1 addition and 9 deletions
Showing only changes of commit 93d19fb95b


@@ -50,7 +50,7 @@ def do_stream(ws, model_name):
     token=handler.token,
     prompt=input_prompt,
     response=quitting_err_msg,
-    gen_time=elapsed_time,
+    gen_time=None,
     parameters=handler.parameters,
     headers=r_headers,
     backend_response_code=response_status_code,
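
For context, this hunk changes the error-path logging call in do_stream so an aborted stream is recorded with gen_time=None instead of the measured elapsed time. A minimal, self-contained sketch of that pattern, assuming a hypothetical log_prompt helper (the real helper and the surrounding variables are not shown in the diff):

import time

def log_prompt(**fields):
    # Hypothetical stand-in for the project's prompt-logging helper.
    print(fields)

def do_stream_sketch(prompt):
    start = time.time()
    try:
        raise ConnectionError("client disconnected")  # simulate an aborted stream
    except ConnectionError as err:
        elapsed_time = time.time() - start  # still measured, but no longer logged
        log_prompt(
            prompt=prompt,
            response=str(err),
            gen_time=None,  # was elapsed_time before this change
            backend_response_code=None,
        )

if __name__ == "__main__":
    do_stream_sketch("hello")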


@@ -24,17 +24,9 @@ from llm_server.routes.server_error import handle_server_error
from llm_server.routes.v1 import bp
from llm_server.sock import init_socketio
# TODO: per-backend workers
# TODO: allow setting concurrent gens per-backend
# TODO: set the max tokens to that of the lowest backend
# TODO: implement RRD backend loadbalancer option
# TODO: have VLLM reject a request if it already has n == concurrent_gens running
# TODO: add a way to cancel VLLM gens. Maybe use websockets?
# TODO: use coloredlogs
# TODO: need to update opts. for workers
# TODO: add a healthcheck to VLLM
# TODO: allow choosing the model by the URL path
# TODO: have VLLM report context size, uptime
# Lower priority
# TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
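
As an aside on the TODO list above: the "implement RRD backend loadbalancer option" item presumably refers to round-robin distribution of requests across backends. A minimal sketch of what such a selector could look like, assuming a hypothetical RoundRobinBalancer class and backend URLs that do not appear in this repository:

import itertools
import threading

class RoundRobinBalancer:
    # Cycle through the configured backends in order; thread-safe.
    # Purely illustrative: backend registration and health checks are omitted.
    def __init__(self, backends):
        self._cycle = itertools.cycle(list(backends))
        self._lock = threading.Lock()

    def next_backend(self):
        with self._lock:
            return next(self._cycle)

# Usage sketch:
# balancer = RoundRobinBalancer(["http://gpu-0:7000", "http://gpu-1:7000"])
# backend_url = balancer.next_backend()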