diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
index 16002e4..6149a70 100644
--- a/llm_server/routes/v1/generate_stats.py
+++ b/llm_server/routes/v1/generate_stats.py
@@ -30,12 +30,12 @@ def generate_stats():
     active_gen_workers = get_active_gen_workers()
     proompters_in_queue = len(priority_queue)
 
-    average_tps = float(redis.get('average_tps'))
+    estimated_avg_tps = float(redis.get('estimated_avg_tps'))
 
     if opts.average_generation_time_mode == 'database':
         average_generation_time = float(redis.get('average_generation_elapsed_sec'))
         # average_output_tokens = float(redis.get('average_output_tokens'))
-        # average_generation_time_from_tps = (average_output_tokens / average_tps)
+        # average_generation_time_from_tps = (average_output_tokens / estimated_avg_tps)
 
         # What to use in our math that calculates the wait time.
         # We could use the average TPS but we don't know the exact TPS value, only
@@ -46,7 +46,7 @@ def generate_stats():
             (gen_time_calc * proompters_in_queue) / opts.concurrent_gens  # Calculate wait time for items in queue
         ) + (
             active_gen_workers * gen_time_calc  # Calculate wait time for in-process items
-        ) if average_tps > 0 else 0
+        ) if estimated_avg_tps > 0 else 0
     elif opts.average_generation_time_mode == 'minute':
         average_generation_time = calculate_avg_gen_time()
         gen_time_calc = average_generation_time
@@ -75,7 +75,7 @@ def generate_stats():
             'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
             'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
             'average_generation_elapsed_sec': int(gen_time_calc),
-            'average_tps': average_tps,
+            'estimated_avg_tps': estimated_avg_tps,
             'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
             'nvidia': netdata_stats
         },
diff --git a/llm_server/threads.py b/llm_server/threads.py
index f2a51b3..baf3da5 100644
--- a/llm_server/threads.py
+++ b/llm_server/threads.py
@@ -17,7 +17,7 @@ class MainBackgroundThread(Thread):
         Thread.__init__(self)
         self.daemon = True
         redis.set('average_generation_elapsed_sec', 0)
-        redis.set('average_tps', 0)
+        redis.set('estimated_avg_tps', 0)
         redis.set('average_output_tokens', 0)
         redis.set('backend_online', 0)
         redis.set_dict('backend_info', {})
@@ -66,6 +66,6 @@ class MainBackgroundThread(Thread):
             # print(f'Weighted: {average_output_tokens}, overall: {overall}')
 
             # Avoid division by zero
-            average_tps = round(average_output_tokens / average_generation_elapsed_sec, 2) if average_generation_elapsed_sec > 0 else 0
-            redis.set('average_tps', average_tps)
+            estimated_avg_tps = round(average_output_tokens / average_generation_elapsed_sec, 2) if average_generation_elapsed_sec > 0 else 0
+            redis.set('estimated_avg_tps', estimated_avg_tps)
             time.sleep(60)