import time
from threading import Thread

from llm_server.cluster.backend import test_backend
from llm_server.cluster.cluster_config import cluster_config
from llm_server.cluster.stores import redis_running_models

# Seconds between health-check cycles.
_CHECK_INTERVAL = 15
# Send a full test prompt every Nth cycle (4 * 15s = every 60 seconds).
_TEST_PROMPT_EVERY = 4


def cluster_worker():
    """Probe every configured backend in parallel, forever.

    Each cycle spawns one thread per backend running ``check_backend``;
    every fourth cycle the probe also sends a real test prompt.
    NOTE(review): the original comment claimed "every 120 seconds", but
    ``counter % 4`` at a 15 s interval is every 60 seconds — the comment
    was wrong, not the code.
    """
    counter = 0
    while True:
        test_prompt = counter % _TEST_PROMPT_EVERY == 0
        threads = []
        for name, config in cluster_config.all().items():
            thread = Thread(target=check_backend, args=(name, config, test_prompt))
            thread.start()
            threads.append(thread)
        # Wait for all probes to finish before sleeping for the next cycle.
        for thread in threads:
            thread.join()
        time.sleep(_CHECK_INTERVAL)
        counter += 1


def check_backend(n, v, test_prompt):
    """Probe one backend and record its status.

    :param n: backend name (key in ``cluster_config``).
    :param v: backend config dict; must contain ``'backend_url'``.
    :param test_prompt: when True, send a real test prompt to the backend.

    Side effects: updates every key returned by ``test_backend`` in
    ``cluster_config``; adds/removes ``n`` in the ``redis_running_models``
    sets; finally records the ``online`` flag.
    """
    online, backend_info = test_backend(v['backend_url'], test_prompt=test_prompt)
    if online:
        running_model = backend_info['model']
        # Loop variables renamed from 'k, v' — the original 'v' shadowed
        # the config-dict parameter.
        for key, value in backend_info.items():
            cluster_config.set_backend_value(n, key, value)
        redis_running_models.sadd(running_model, n)
    else:
        # Backend is down: remove it from every model's running set.
        # (assumes redis_running_models.keys() yields all model set names
        #  — TODO confirm against the store wrapper)
        for model in redis_running_models.keys():
            redis_running_models.srem(model, n)
    cluster_config.set_backend_value(n, 'online', online)