diff --git a/llm_server/workers/inferencer.py b/llm_server/workers/inferencer.py
index 26028ab..178bfd6 100644
--- a/llm_server/workers/inferencer.py
+++ b/llm_server/workers/inferencer.py
@@ -12,17 +12,10 @@ def worker():
         (request_json_body, client_ip, token, parameters, backend_url), event_id, selected_model = priority_queue.get()
         if not backend_url:
             backend_url = get_a_cluster_backend(selected_model)
+        else:
+            backend_url = cluster_config.validate_backend(backend_url)
         backend_info = cluster_config.get_backend(backend_url)
 
-        # The backend could have died between when the request was
-        # submitted and now, so let's double check it's still online.
-        if not backend_info['online']:
-            old = backend_url
-            backend_url = get_a_cluster_backend()
-            backend_info = cluster_config.get_backend(backend_url)
-            print(f'Backend {old} offline. Request was redirected to {backend_url}')
-            del old  # gc
-
         if not selected_model:
             selected_model = backend_info['model']
 
@@ -67,7 +60,7 @@ def need_to_wait(backend_url: str):
     active_workers = redis.get(f'active_gen_workers:{backend_url}', 0, dtype=int)
     concurrent_gens = cluster_config.get_backend(backend_url).get('concurrent_gens', 1)
     s = time.time()
-    print(active_workers >= concurrent_gens, active_workers, concurrent_gens)
+    print(active_workers)
     while active_workers >= concurrent_gens:
         time.sleep(0.01)
     e = time.time()
diff --git a/server.py b/server.py
index 37c254b..c3ed4a2 100644
--- a/server.py
+++ b/server.py
@@ -30,6 +30,7 @@ from llm_server.routes.v1 import bp
 from llm_server.routes.v1.generate_stats import generate_stats
 from llm_server.sock import init_socketio
 
+# TODO: queue item timeout
 # TODO: return an `error: True`, error code, and error message rather than just a formatted message
 # TODO: what happens when all backends are offline? What about the "online" key in the stats page?
 # TODO: redis SCAN vs KEYS??