import time

import requests

from llm_server.cluster.cluster_config import get_backends, cluster_config
from llm_server.config.global_config import GlobalConfig
from llm_server.custom_redis import redis
from llm_server.database.database import weighted_average_column_for_model
from llm_server.llm.info import get_info
from llm_server.logging import create_logger
from llm_server.routes.queue import RedisPriorityQueue, priority_queue


def main_background_thread():
    """Run the periodic maintenance loop. This function never returns.

    Each pass, followed by a 30 second sleep:

    1. Recomputes the generation statistics of every online backend and
       stores them through ``cluster_config.set_backend_value()``.
    2. If ``GlobalConfig.get().background_homepage_cacher`` is truthy, issues
       a GET request to ``https://<base_client_api>``.
    3. Calls ``cleanup()`` on the ``RedisPriorityQueue`` of every backend
       returned by ``priority_queue.get_backends()``.

    NOTE(review): only the homepage request is wrapped in ``try``/``except``;
    an exception raised by any other step propagates out of the loop.
    """
    logger = create_logger('main_bg')
    while True:
        # get_backends() returns (online, offline); only online ones are polled.
        online, _ = get_backends()
        for backend_url in online:
            _update_backend_stats(backend_url)

        if GlobalConfig.get().background_homepage_cacher:
            _fetch_homepage(logger)

        for backend_url in priority_queue.get_backends():
            RedisPriorityQueue(backend_url).cleanup()

        time.sleep(30)


def _update_backend_stats(backend_url):
    """Recompute the generation statistics of one backend and store them."""
    backend_mode = cluster_config.get_backend(backend_url)['mode']
    running_model = get_info(backend_url, backend_mode).get('model')
    if not running_model:
        # The backend did not report a model, so there is nothing to look up.
        return

    elapsed_sec, output_tokens, avg_tps = calc_stats_for_backend(backend_url, running_model, backend_mode)

    # calc_stats_for_backend() reports 0 (not None) when it has no usable
    # average. Falsy values are skipped so they are never written to the
    # cluster config.
    if elapsed_sec:
        cluster_config.set_backend_value(backend_url, 'average_generation_elapsed_sec', elapsed_sec)
    if output_tokens:
        cluster_config.set_backend_value(backend_url, 'average_output_tokens', output_tokens)
    if elapsed_sec and output_tokens:
        cluster_config.set_backend_value(backend_url, 'estimated_avg_tps', avg_tps)


def _fetch_homepage(logger):
    """Request the homepage once, logging (not raising) any failure."""
    # Presumably this keeps a cached copy of the homepage warm - TODO confirm.
    try:
        base_client_api = redis.get('base_client_api', dtype=str)
        # The response itself is not needed; only the request matters here.
        requests.get('https://' + base_client_api, timeout=5)
    except Exception as e:
        # Deliberately broad: a failed fetch must not stop the maintenance loop.
        logger.error(f'Failed fetch the homepage - {e.__class__.__name__}: {e}')


def calc_stats_for_backend(backend_url, running_model, backend_mode):
    """Compute the average generation statistics for one backend.

    All three arguments are forwarded unchanged to
    ``weighted_average_column_for_model()``.

    Returns:
        A tuple ``(average_generation_elapsed_sec, average_output_tokens,
        estimated_avg_tps)``. A falsy average from the database layer is
        reported as ``0``. ``estimated_avg_tps`` is tokens divided by seconds,
        rounded to two decimals, or ``0`` when the elapsed time is not
        positive.
    """
    include_system_tokens = GlobalConfig.get().include_system_tokens_in_stats

    def weighted_average(column):
        # exclude_zeros=True filters out rows where an error message was
        # returned. Previously, if there was an error, 0 was entered into the
        # column. The new code enters null instead, but we need to stay
        # backwards compatible for now.
        return weighted_average_column_for_model(
            'messages',
            column,
            running_model,
            backend_mode,
            backend_url,
            exclude_zeros=True,
            include_system_tokens=include_system_tokens,
        ) or 0

    average_generation_elapsed_sec = weighted_average('generation_time')
    average_output_tokens = weighted_average('response_tokens')

    if average_generation_elapsed_sec > 0:
        estimated_avg_tps = round(average_output_tokens / average_generation_elapsed_sec, 2)
    else:
        # Avoid division by zero.
        estimated_avg_tps = 0

    return average_generation_elapsed_sec, average_output_tokens, estimated_avg_tps