diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
index 1a28df7..782752e 100644
--- a/llm_server/routes/v1/generate_stats.py
+++ b/llm_server/routes/v1/generate_stats.py
@@ -5,7 +5,7 @@ from llm_server import opts
 from llm_server.database import get_distinct_ips_24h, sum_column
 from llm_server.helpers import deep_sort
 from llm_server.llm.info import get_running_model
-from llm_server.netdata import get_gpu_wh, get_power_states
+from llm_server.netdata import get_power_states
 from llm_server.routes.cache import redis
 from llm_server.routes.queue import priority_queue
 from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
@@ -28,16 +28,29 @@ def generate_stats():
     # waits = [elapsed for end, elapsed in t]
     # estimated_wait = int(sum(waits) / len(waits))
 
+    active_gen_workers = get_active_gen_workers()
     proompters_in_queue = len(priority_queue)
     average_tps = float(redis.get('average_tps'))
 
     if opts.average_generation_time_mode == 'database':
-        average_generation_time = int(float(redis.get('average_generation_elapsed_sec')))
-        average_output_tokens = int(float(redis.get('average_output_tokens')))
-        estimated_wait_sec = int(((average_output_tokens / average_tps) * proompters_in_queue) / opts.concurrent_gens) if average_tps > 0 else 0
+        average_generation_time = float(redis.get('average_generation_elapsed_sec'))
+        average_output_tokens = float(redis.get('average_output_tokens'))
+        # average_generation_time_from_tps = (average_output_tokens / average_tps)
+
+        # What to use in our math that calculates the wait time.
+        # We could use the average TPS but we don't know the exact TPS value, only
+        # the backend knows that. So, let's just stick with the elapsed time.
+        gen_time_calc = average_generation_time
+
+        estimated_wait_sec = (
+            (gen_time_calc * proompters_in_queue) / opts.concurrent_gens  # Calculate wait time for items in queue
+        ) + (
+            active_gen_workers * gen_time_calc  # Calculate wait time for in-process items
+        ) if average_tps > 0 else 0
     elif opts.average_generation_time_mode == 'minute':
-        average_generation_time = int(calculate_avg_gen_time())
-        estimated_wait_sec = int((average_generation_time * proompters_in_queue) / opts.concurrent_gens)
+        average_generation_time = calculate_avg_gen_time()
+        gen_time_calc = average_generation_time
+        estimated_wait_sec = ((gen_time_calc * proompters_in_queue) / opts.concurrent_gens) + (active_gen_workers * gen_time_calc)
     else:
         raise Exception
 
@@ -60,12 +73,12 @@ def generate_stats():
                 '24_hrs': get_distinct_ips_24h(),
             },
             'proompts': {
-                'processing': get_active_gen_workers(),
+                'processing': active_gen_workers,
                 'queued': proompters_in_queue,
                 'total': get_total_proompts() if opts.show_num_prompts else None,
            },
             'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
-            'average_generation_elapsed_sec': average_generation_time,
+            'average_generation_elapsed_sec': int(gen_time_calc),
             'average_tps': average_tps,
             'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
             'nvidia': netdata_stats
@@ -74,7 +87,7 @@ def generate_stats():
         'endpoints': {
             'blocking': opts.full_client_api,
         },
-        'estimated_wait_sec': estimated_wait_sec,
+        'estimated_wait_sec': int(estimated_wait_sec),
         'timestamp': int(time.time()),
        'config': {
             'gatekeeper': 'none' if opts.auth_required is False else 'token',
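
For context, the new estimate adds two components: the wait for queued requests spread across the concurrent generation slots, plus the wait for requests already being processed. Below is a minimal standalone sketch of that arithmetic; the function name and the example values are hypothetical, while in the diff the inputs come from Redis, the priority queue, and opts.concurrent_gens.

# Standalone sketch of the wait-time estimate from the diff above.
# The values in the example call are hypothetical; in the real code they
# come from Redis, the priority queue, and opts.concurrent_gens.
def estimate_wait_sec(gen_time_calc: float, proompters_in_queue: int,
                      active_gen_workers: int, concurrent_gens: int) -> float:
    # Wait for items in the queue, spread across the concurrent generation slots
    queue_wait = (gen_time_calc * proompters_in_queue) / concurrent_gens
    # Wait for items that are already being processed
    in_process_wait = active_gen_workers * gen_time_calc
    return queue_wait + in_process_wait

# Example: 10 s average generation time, 4 queued, 2 in process, 2 slots
# -> (10 * 4) / 2 + 2 * 10 = 40 seconds
print(estimate_wait_sec(10.0, 4, 2, 2))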