diff --git a/llm_server/workers/inferencer.py b/llm_server/workers/inferencer.py
index 07de40e..0aff9ac 100644
--- a/llm_server/workers/inferencer.py
+++ b/llm_server/workers/inferencer.py
@@ -1,6 +1,7 @@
 import threading
 import time
 
+from llm_server.cluster.backend import get_a_cluster_backend
 from llm_server.cluster.cluster_config import cluster_config
 from llm_server.custom_redis import redis
 from llm_server.llm.generator import generator
@@ -10,8 +11,17 @@ from llm_server.routes.queue import DataEvent, decr_active_workers, decrement_ip
 
 def worker():
     while True:
         (request_json_body, client_ip, token, parameters, backend_url), event_id, selected_model = priority_queue.get()
+        backend_info = cluster_config.get_backend(backend_url)
+
+        if not backend_info['online']:
+            old = backend_url
+            backend_url = get_a_cluster_backend()
+            backend_info = cluster_config.get_backend(backend_url)
+            print(f'Backend {old} offline. Request was redirected to {backend_url}')
+            del old
+
         if not selected_model:
-            selected_model = cluster_config.get_backend(backend_url)['model']
+            selected_model = backend_info['model']
 
         # This wait time is "invisible", meaning the worker may as
         # well be still waiting to get an item from the queue.
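
A minimal, self-contained sketch of the failover behaviour this diff introduces: if a queued request targets a backend that the cluster config reports as offline, the worker swaps in whatever backend get_a_cluster_backend() returns and reuses the freshly fetched backend info for the model lookup. The _BACKENDS table, _StubClusterConfig class, and the stub get_a_cluster_backend() below are hypothetical stand-ins for the real llm_server objects, included only to make the snippet runnable on its own.

# Sketch only: hypothetical stand-ins for cluster_config and
# get_a_cluster_backend from llm_server, with made-up backend data.
_BACKENDS = {
    'http://backend-a:7000': {'online': False, 'model': 'llama-13b'},
    'http://backend-b:7000': {'online': True, 'model': 'llama-70b'},
}


class _StubClusterConfig:
    def get_backend(self, backend_url):
        return _BACKENDS[backend_url]


cluster_config = _StubClusterConfig()


def get_a_cluster_backend():
    # Return any backend currently marked online.
    return next(url for url, info in _BACKENDS.items() if info['online'])


def resolve_backend(backend_url, selected_model):
    backend_info = cluster_config.get_backend(backend_url)

    # Redirect the request if its original backend is offline.
    if not backend_info['online']:
        old = backend_url
        backend_url = get_a_cluster_backend()
        backend_info = cluster_config.get_backend(backend_url)
        print(f'Backend {old} offline. Request was redirected to {backend_url}')

    # Reuse the already-fetched backend_info instead of a second lookup.
    if not selected_model:
        selected_model = backend_info['model']

    return backend_url, selected_model


if __name__ == '__main__':
    # The request aimed at offline backend-a is redirected to backend-b.
    print(resolve_backend('http://backend-a:7000', None))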