From da20d1807b1f419304d6604b5fdf60cfd0154281 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Thu, 5 Oct 2023 20:17:42 -0600
Subject: [PATCH] actually wait again

---
 llm_server/workers/inferencer.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/llm_server/workers/inferencer.py b/llm_server/workers/inferencer.py
index a545ae6..05a2981 100644
--- a/llm_server/workers/inferencer.py
+++ b/llm_server/workers/inferencer.py
@@ -19,6 +19,8 @@ def worker():
         if not selected_model:
             selected_model = backend_info['model']
 
+        need_to_wait(backend_url)
+
         increment_ip_count(client_ip, 'processing_ips')
         incr_active_workers(selected_model, backend_url)
 
@@ -56,3 +58,16 @@ def start_workers(num_workers: int):
         t.start()
         i += 1
     print(f'Started {i} inference workers.')
+
+
+def need_to_wait(backend_url: str):
+    # Check the shared active worker count since the streaming endpoint may also be generating on this backend.
+    concurrent_gens = cluster_config.get_backend(backend_url).get('concurrent_gens', 1)
+    s = time.time()
+    active_workers = redis.get(f'active_gen_workers:{backend_url}', 0, dtype=int)
+    while active_workers >= concurrent_gens:
+        time.sleep(0.01)
+        active_workers = redis.get(f'active_gen_workers:{backend_url}', 0, dtype=int)  # re-read so the loop sees other workers finish
+    e = time.time()
+    if e - s > 0.1:
+        print(f'Worker was delayed {e - s} seconds.')
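
For reference, below is a minimal standalone sketch of the same polling pattern using plain redis-py rather than this project's helpers. The redis.get(..., 0, dtype=int) wrapper and cluster_config above are specific to this codebase; the wait_for_free_slot name and the default Redis() connection here are illustrative assumptions, not the project's API:

    import time

    import redis

    r = redis.Redis()  # assumes a local Redis instance

    def wait_for_free_slot(backend_url: str, concurrent_gens: int) -> None:
        """Block until the backend's active generation count drops below its limit."""
        start = time.time()
        while True:
            # Re-read the counter every iteration so this loop observes other
            # workers finishing; a single read taken before the loop would
            # never change and the wait could spin forever.
            raw = r.get(f'active_gen_workers:{backend_url}')
            active_workers = int(raw) if raw is not None else 0
            if active_workers < concurrent_gens:
                break
            time.sleep(0.01)
        elapsed = time.time() - start
        if elapsed > 0.1:
            print(f'Worker was delayed {elapsed} seconds.')

The 0.01s sleep keeps the busy-wait cheap. A Redis blocking primitive (e.g. BLPOP on a list of free slots) would avoid polling entirely, but polling is simpler given the counter already exists.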