diff --git a/llm_server/routes/openai/chat_completions.py b/llm_server/routes/openai/chat_completions.py index d10bdf6..2c13131 100644 --- a/llm_server/routes/openai/chat_completions.py +++ b/llm_server/routes/openai/chat_completions.py @@ -151,6 +151,7 @@ def openai_chat_completions(): # The worker incremented it, we'll decrement it. decrement_ip_count(handler.client_ip, 'processing_ips') decr_active_workers(handler.selected_model, handler.backend_url) + print('cleaned up') return Response(generate(), mimetype='text/event-stream') except Exception: diff --git a/llm_server/workers/inferencer.py b/llm_server/workers/inferencer.py index 4ec85b5..e023c86 100644 --- a/llm_server/workers/inferencer.py +++ b/llm_server/workers/inferencer.py @@ -69,5 +69,5 @@ def need_to_wait(backend_url: str): while active_workers >= concurrent_gens: time.sleep(0.01) e = time.time() - if e - s > 0.5: + if e - s > 0.1: print(f'Worker was delayed {e - s} seconds.')