From 4f226ae38ebb4ecfc0879417996314e33e8e53f8 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Mon, 2 Oct 2023 11:11:48 -0600
Subject: [PATCH] handle requests to offline backends

---
 llm_server/workers/inferencer.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llm_server/workers/inferencer.py b/llm_server/workers/inferencer.py
index 07de40e..0aff9ac 100644
--- a/llm_server/workers/inferencer.py
+++ b/llm_server/workers/inferencer.py
@@ -1,6 +1,7 @@
 import threading
 import time
 
+from llm_server.cluster.backend import get_a_cluster_backend
 from llm_server.cluster.cluster_config import cluster_config
 from llm_server.custom_redis import redis
 from llm_server.llm.generator import generator
@@ -10,8 +11,17 @@ from llm_server.routes.queue import DataEvent, decr_active_workers, decrement_ip
 def worker():
     while True:
         (request_json_body, client_ip, token, parameters, backend_url), event_id, selected_model = priority_queue.get()
+        backend_info = cluster_config.get_backend(backend_url)
+
+        if not backend_info['online']:
+            old = backend_url
+            backend_url = get_a_cluster_backend()
+            backend_info = cluster_config.get_backend(backend_url)
+            print(f'Backend {old} offline. Request was redirected to {backend_url}')
+            del old
+
         if not selected_model:
-            selected_model = cluster_config.get_backend(backend_url)['model']
+            selected_model = backend_info['model']
 
         # This wait time is "invisible", meaning the worker may as
         # well be still waiting to get an item from the queue.
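
For reference, a minimal sketch of how the top of the worker loop reads with this patch applied. It assumes (as the diff suggests, but the rest of the file is not shown here) that get_a_cluster_backend() returns a backend URL string and that cluster_config.get_backend() returns a dict containing 'online' and 'model' keys:

def worker():
    while True:
        (request_json_body, client_ip, token, parameters, backend_url), event_id, selected_model = priority_queue.get()
        backend_info = cluster_config.get_backend(backend_url)

        if not backend_info['online']:
            # The queued request targeted a backend that has since gone
            # offline, so redirect it to a currently available backend.
            old = backend_url
            backend_url = get_a_cluster_backend()
            backend_info = cluster_config.get_backend(backend_url)
            print(f'Backend {old} offline. Request was redirected to {backend_url}')
            del old

        if not selected_model:
            # Reuse the backend info fetched above instead of doing a
            # second cluster_config lookup for the same backend.
            selected_model = backend_info['model']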