import time
from datetime import datetime

from llm_server import opts
from llm_server.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort, round_up_base
from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_states
from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time


def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    workers_running = gen_time_calc if active_gen_workers > 0 else 0
    if proompters_in_queue > 0:
        # Calculate how long it will take to complete the currently running gens and the queued requests.
        # If the number of proompters in the queue is less than or equal to the number of concurrent workers,
        # just use the calculated generation time. Otherwise, divide the queue size by how many requests we can
        # process concurrently, multiply by the calculated generation time, then round that number up to the
        # nearest multiple of gen_time_calc (i.e. if gen_time_calc is 8 and the calculated number is 11.6, we get 16).
        # Finally, add gen_time_calc (when workers are active) to account for the currently running generations.
        # This assumes that all active workers will finish at the same time, which is unlikely.
        # Regardless, this is the most accurate estimate we can get without tracking worker elapsed times.
        proompters_in_queue_wait_time = gen_time_calc if (proompters_in_queue / concurrent_gens) <= 1 \
            else round_up_base(((proompters_in_queue / concurrent_gens) * gen_time_calc), base=gen_time_calc) + workers_running
        return proompters_in_queue_wait_time
    elif proompters_in_queue == 0 and active_gen_workers == 0:
        # No queue, no workers
        return 0
    else:
        # No queue
        return gen_time_calc


# TODO: have routes/__init__.py point to the latest API version generate_stats()

def generate_stats():
    model_name, error = get_running_model()  # will return False when the fetch fails
    if isinstance(model_name, bool):
        online = False
    else:
        online = True
        opts.running_model = model_name

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    active_gen_workers = get_active_gen_workers()
    proompters_in_queue = len(priority_queue)

    estimated_avg_tps = float(redis.get('estimated_avg_tps'))

    if opts.average_generation_time_mode == 'database':
        average_generation_time = float(redis.get('average_generation_elapsed_sec'))
        # average_output_tokens = float(redis.get('average_output_tokens'))
        # average_generation_time_from_tps = (average_output_tokens / estimated_avg_tps)

        # What to use in our math that calculates the wait time.
        # We could use the average TPS, but we don't know the exact TPS value; only
        # the backend knows that. So, let's just stick with the elapsed time.
        gen_time_calc = average_generation_time

        estimated_wait_sec = calculate_wait_time(gen_time_calc, proompters_in_queue, opts.concurrent_gens, active_gen_workers)
    elif opts.average_generation_time_mode == 'minute':
        average_generation_time = calculate_avg_gen_time()
        gen_time_calc = average_generation_time
        estimated_wait_sec = ((gen_time_calc * proompters_in_queue) / opts.concurrent_gens) + (active_gen_workers * gen_time_calc)
    else:
        raise Exception

    if opts.netdata_root:
        netdata_stats = {}
        power_states = get_power_states()
        for gpu, power_state in power_states.items():
            netdata_stats[gpu] = {
                'power_state': power_state,
                # 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
            }
    else:
        netdata_stats = {}

    output = {
        'stats': {
            'proompters': {
                '1_min': SemaphoreCheckerThread.proompters_1_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': int(gen_time_calc),
            'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
            'nvidia': netdata_stats
        },
        'online': online,
        'endpoints': {
            'blocking': f'https://{opts.base_client_api}',
            'streaming': f'wss://{opts.base_client_api}/v1/stream' if opts.enable_streaming else None,
        },
        'queue': {
            'processing': active_gen_workers,
            'queued': proompters_in_queue,
            'estimated_wait_sec': int(estimated_wait_sec),
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'concurrent': opts.concurrent_gens,
            'model': opts.manual_model_name if opts.manual_model_name else model_name,
            'mode': opts.mode,
            'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
        },
        'keys': {
            'openaiKeys': '∞',
            'anthropicKeys': '∞',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
    }
    return deep_sort(output)
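

# Illustrative sketch: a quick, hypothetical walkthrough of calculate_wait_time(), assuming
# round_up_base() rounds its first argument up to the nearest multiple of `base`. All numbers
# below are made up purely for the example.
if __name__ == '__main__':
    # 5 queued prompts, 3 concurrent workers, an 8 s average generation time, 3 busy workers:
    # 5 / 3 ≈ 1.67 > 1, so 1.67 * 8 ≈ 13.3 is rounded up to 16, and 8 s is added for the
    # generations already in flight, giving an estimate of 24 seconds.
    print(calculate_wait_time(gen_time_calc=8, proompters_in_queue=5, concurrent_gens=3, active_gen_workers=3))  # -> 24
    # Empty queue but workers are busy: just the average generation time.
    print(calculate_wait_time(gen_time_calc=8, proompters_in_queue=0, concurrent_gens=3, active_gen_workers=2))  # -> 8
    # Idle server: no wait at all.
    print(calculate_wait_time(gen_time_calc=8, proompters_in_queue=0, concurrent_gens=3, active_gen_workers=0))  # -> 0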