import time
from datetime import datetime

from llm_server import opts
from llm_server.database.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort, round_up_base
from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_states
from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import get_active_gen_workers, get_total_proompts, server_start_time


def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    if active_gen_workers < concurrent_gens:
        # A worker slot is free, so a new request can start immediately.
        return 0
    elif active_gen_workers >= concurrent_gens:
        # Calculate how long it will take to complete the currently running gens and the queued requests.
        # If the proompters in the queue fit into a single batch of workers (queue / concurrent_gens <= 1), just use
        # the calculated generation time. Otherwise, multiply the calculated generation time by the number of batches
        # we will have to run, then round that number up to the nearest multiple of gen_time_calc (i.e. if
        # gen_time_calc is 8 and the calculated number is 11.6, we will get 16). Finally, add gen_time_calc to
        # account for the currently running generations.
        # This assumes that all active workers will finish at the same time, which is unlikely.
        # Regardless, this is the most accurate estimate we can get without tracking worker elapsed times.
        proompters_in_queue_wait_time = gen_time_calc if (proompters_in_queue / concurrent_gens) <= 1 \
            else round_up_base((proompters_in_queue / concurrent_gens) * gen_time_calc, base=gen_time_calc)
        return proompters_in_queue_wait_time + gen_time_calc if active_gen_workers > 0 else 0
    elif proompters_in_queue == 0 and active_gen_workers == 0:
        # No queue, no workers.
        # NOTE: unreachable; the two branches above already cover every value of active_gen_workers.
        return 0
    else:
        # No queue.
        # NOTE: unreachable for the same reason.
        return gen_time_calc
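
# Worked example (hypothetical numbers; assumes round_up_base(x, base) rounds x up to the
# nearest multiple of `base`, per the comment above):
#   calculate_wait_time(gen_time_calc=8, proompters_in_queue=3, concurrent_gens=2, active_gen_workers=2)
#   -> 3 / 2 = 1.5 > 1, so round_up_base(1.5 * 8, base=8) = 16
#   -> 16 + 8 (for the currently running generations) = 24 seconds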

# TODO: have routes/__init__.py point to the latest API version generate_stats()

def generate_stats(regen: bool = False):
    if not regen:
        c = redis.get('proxy_stats', dict)
        if c:
            return c

    model_name, error = get_running_model()  # will return False when the fetch fails
    if isinstance(model_name, bool):
        online = False
    else:
        online = True
        redis.set('running_model', model_name)

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    active_gen_workers = get_active_gen_workers()
    proompters_in_queue = len(priority_queue)

    # This is so wildly inaccurate it's disabled until I implement stats reporting into VLLM.
    # estimated_avg_tps = redis.get('estimated_avg_tps', float, default=0)

    average_generation_time = redis.get('average_generation_elapsed_sec', float, default=0)
    estimated_wait_sec = calculate_wait_time(average_generation_time, proompters_in_queue, opts.concurrent_gens, active_gen_workers)

    if opts.netdata_root:
        netdata_stats = {}
        power_states = get_power_states()
        for gpu, power_state in power_states.items():
            netdata_stats[gpu] = {
                'power_state': power_state,
                # 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
            }
    else:
        netdata_stats = {}

    base_client_api = redis.get('base_client_api', str)
    proompters_5_min = len(redis.zrangebyscore('recent_prompters', time.time() - 5 * 60, '+inf'))

    output = {
        'stats': {
            'proompters': {
                '5_min': proompters_5_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': int(average_generation_time),
            # 'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
        },
        'online': online,
        'endpoints': {
            'blocking': f'https://{base_client_api}',
            'streaming': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
        },
        'queue': {
            'processing': active_gen_workers,
            'queued': proompters_in_queue,
            'estimated_wait_sec': int(estimated_wait_sec),
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'concurrent': opts.concurrent_gens,
            'model': opts.manual_model_name if opts.manual_model_name else model_name,
            'mode': opts.mode,
            'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
        },
        'keys': {
            'openaiKeys': '∞',
            'anthropicKeys': '∞',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
        'nvidia': netdata_stats,
    }
    result = deep_sort(output)

    # It may take a bit to get the base client API, so don't cache until then.
    if base_client_api:
        redis.set_dict('proxy_stats', result)  # Cache with no expiry
    return result
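
# Usage sketch (hypothetical caller; the actual route wiring lives elsewhere in llm_server.routes):
#
#   stats = generate_stats()            # served from the Redis 'proxy_stats' cache when available
#   stats = generate_stats(regen=True)  # force a rebuild, e.g. from a background refresh worker
#   print(stats['queue']['estimated_wait_sec'])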