From 93d19fb95b9ebf9a8df11e6bbad885eb298122b9 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Sun, 1 Oct 2023 10:25:32 -0600
Subject: [PATCH] fix exception

---
 llm_server/routes/v1/generate_stream.py | 2 +-
 server.py                               | 8 --------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index d6328d1..24d5bc6 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -50,7 +50,7 @@ def do_stream(ws, model_name):
                 token=handler.token,
                 prompt=input_prompt,
                 response=quitting_err_msg,
-                gen_time=elapsed_time,
+                gen_time=None,
                 parameters=handler.parameters,
                 headers=r_headers,
                 backend_response_code=response_status_code,
diff --git a/server.py b/server.py
index 6c06303..71685a4 100644
--- a/server.py
+++ b/server.py
@@ -24,17 +24,9 @@ from llm_server.routes.server_error import handle_server_error
 from llm_server.routes.v1 import bp
 from llm_server.sock import init_socketio
 
-# TODO: per-backend workers
-# TODO: allow setting concurrent gens per-backend
-# TODO: set the max tokens to that of the lowest backend
-# TODO: implement RRD backend loadbalancer option
-# TODO: have VLLM reject a request if it already has n == concurrent_gens running
 # TODO: add a way to cancel VLLM gens. Maybe use websockets?
-# TODO: use coloredlogs
 # TODO: need to update opts. for workers
 # TODO: add a healthcheck to VLLM
-# TODO: allow choosing the model by the URL path
-# TODO: have VLLM report context size, uptime
 
 # Lower priority
 # TODO: set VLLM to stream ALL data using socket.io. If the socket disconnects, cancel generation.
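A minimal sketch of the bug this patch plausibly fixes, assuming the "fix exception" refers to `elapsed_time` being unbound on the client-quit path in `do_stream()` (the patch only confirms that `gen_time=elapsed_time` was replaced with `gen_time=None`). The helper names below (`log_stream_event`, `stream_old`, `stream_fixed`, `client_quit`) are hypothetical and not part of the repository's API.

```python
from typing import Optional


def log_stream_event(*, prompt: str, response: str, gen_time: Optional[float]) -> None:
    """Stand-in for the database logging call inside do_stream()."""
    print(f"logged: prompt={prompt!r} response={response!r} gen_time={gen_time}")


def stream_old(client_quit: bool) -> None:
    """Old behaviour (assumed): elapsed_time is only bound on the happy path,
    so logging it on the quit path raises UnboundLocalError."""
    if not client_quit:
        elapsed_time = 1.23  # only assigned when generation actually finishes
    log_stream_event(prompt="hi", response="the client quit", gen_time=elapsed_time)


def stream_fixed(client_quit: bool) -> None:
    """Patched behaviour: the quit path logs gen_time=None and never touches
    the possibly-unbound elapsed_time."""
    log_stream_event(prompt="hi", response="the client quit", gen_time=None)


if __name__ == "__main__":
    stream_fixed(client_quit=True)  # logs cleanly
    try:
        stream_old(client_quit=True)
    except NameError as exc:  # UnboundLocalError is a subclass of NameError
        print(f"old code raised: {exc}")
```

Logging `None` for the generation time is the smallest change that keeps the error-path log entry while avoiding the unbound reference; computing a real elapsed time on that path would require timing code the quit branch does not have.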