This repository has been archived on 2024-10-27. You can view files and clone it, but cannot push or open issues or pull requests.
local-llm-server/llm_server/routes/v1/generate_stats.py

72 lines
2.9 KiB
Python

import time
from datetime import datetime
from llm_server import opts
from llm_server.database import sum_column
from llm_server.helpers import deep_sort
from llm_server.llm.info import get_running_model
from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
# TODO: have routes/__init__.py point to the latest API version's generate_stats()
def generate_stats():
    """Build the statistics payload describing the server's current state.

    Looks up the running model, combines the queue depth with cached
    throughput figures to estimate how long a new request would wait, and
    assembles everything into a single dictionary.

    Side effects:
        Sets ``opts.running_model`` when the model lookup succeeds.

    Returns:
        The result of ``deep_sort()`` applied to the stats dictionary.

    Raises:
        ValueError: if ``opts.average_generation_time_mode`` is neither
            ``'database'`` nor ``'minute'``.
    """
    model_name, _error = get_running_model()  # will return False when the fetch fails
    online = not isinstance(model_name, bool)
    if online:
        opts.running_model = model_name

    # Requests still waiting in the queue plus those currently being worked on.
    proompters_in_queue = len(priority_queue) + get_active_gen_workers()
    average_tps = _cached_float('average_tps')

    avg_mode = opts.average_generation_time_mode
    if avg_mode == 'database':
        average_generation_time = int(_cached_float('average_generation_elapsed_sec'))
        average_output_tokens = int(_cached_float('average_output_tokens'))
        # A zero TPS (presumably nothing has been generated yet -- TODO confirm)
        # would otherwise raise ZeroDivisionError; report no wait instead.
        seconds_per_request = (average_output_tokens / average_tps) if average_tps else 0
        # NOTE(review): assumes opts.concurrent_gens is non-zero.
        estimated_wait_sec = int((seconds_per_request * proompters_in_queue) / opts.concurrent_gens)
    elif avg_mode == 'minute':
        average_generation_time = int(calculate_avg_gen_time())
        estimated_wait_sec = int((average_generation_time * proompters_in_queue) / opts.concurrent_gens)
    else:
        raise ValueError(
            f'Unsupported average_generation_time_mode: {avg_mode!r} '
            "(expected 'database' or 'minute')"
        )

    output = {
        'stats': {
            'proompts_in_queue': proompters_in_queue,
            'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
            'proompts': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': average_generation_time,
            'average_tps': average_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
        },
        'online': online,
        'endpoints': {
            'blocking': opts.full_client_api,
        },
        'estimated_wait_sec': estimated_wait_sec,
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'queue_size': opts.concurrent_gens,
            'model': model_name,
            'mode': opts.mode,
        },
        'keys': {
            'openaiKeys': '',
            'anthropicKeys': '',
        },
    }
    return deep_sort(output)


def _cached_float(key, default=0.0):
    """Return ``redis.get(key)`` as a float, or ``default`` when it is None."""
    raw = redis.get(key)
    if raw is None:
        return default
    return float(raw)