import time
from datetime import datetime

from llm_server import opts
from llm_server.database import sum_column
from llm_server.helpers import deep_sort
from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_states
from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time


# TODO: have routes/__init__.py point to the latest API version generate_stats()

def generate_stats():
    model_name, error = get_running_model()  # will return False when the fetch fails
    if isinstance(model_name, bool):
        online = False
    else:
        online = True
        opts.running_model = model_name

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    # Requests waiting in the queue plus requests currently being generated.
    proompters_in_queue = len(priority_queue) + get_active_gen_workers()

    average_tps = float(redis.get('average_tps'))

    # Estimate how long a newly queued request will wait, depending on which
    # averaging mode is configured.
    if opts.average_generation_time_mode == 'database':
        average_generation_time = int(float(redis.get('average_generation_elapsed_sec')))
        average_output_tokens = int(float(redis.get('average_output_tokens')))
        estimated_wait_sec = int(((average_output_tokens / average_tps) * proompters_in_queue) / opts.concurrent_gens)
    elif opts.average_generation_time_mode == 'minute':
        average_generation_time = int(calculate_avg_gen_time())
        estimated_wait_sec = int((average_generation_time * proompters_in_queue) / opts.concurrent_gens)
    else:
        raise Exception(f'Unknown average_generation_time_mode: {opts.average_generation_time_mode}')

    # Netdata power-state stats are only gathered when a Netdata root is configured.
    if opts.netdata_root:
        netdata_stats = get_power_states()
    else:
        netdata_stats = {}

    output = {
        'stats': {
            'proompts_in_queue': proompters_in_queue,
            'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
            'proompts': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': average_generation_time,
            'average_tps': average_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
            'nvidia': netdata_stats
        },
        'online': online,
        'endpoints': {
            'blocking': opts.full_client_api,
        },
        'estimated_wait_sec': estimated_wait_sec,
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'queue_size': opts.concurrent_gens,
            'model': model_name,
            'mode': opts.mode,
        },
        'keys': {
            'openaiKeys': '∞',
            'anthropicKeys': '∞',
        },
    }
    return deep_sort(output)
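
# ---------------------------------------------------------------------------
# Usage sketch (assumption, not part of this module): the dict returned by
# generate_stats() is plain JSON-serializable data, so a stats route can return
# it directly. The Flask wiring below is illustrative only; the project's real
# route registration lives elsewhere.
#
#   from flask import Flask, jsonify
#
#   app = Flask(__name__)
#
#   @app.route('/api/v1/stats')
#   def stats():
#       return jsonify(generate_stats())
# ---------------------------------------------------------------------------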