local-llm-server/llm_server/routes/v1/generate_stats.py


import time
from datetime import datetime

from llm_server import opts
from llm_server.cluster.backend import get_a_cluster_backend
from llm_server.cluster.cluster_config import cluster_config
from llm_server.cluster.model_choices import get_model_choices
from llm_server.custom_redis import redis
from llm_server.database.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort
from llm_server.routes.stats import get_total_proompts, server_start_time


def generate_stats(regen: bool = False):
    if not regen:
        c = redis.getp('proxy_stats')
        if c:
            return c

    default_backend_url = get_a_cluster_backend()
    default_backend_info = cluster_config.get_backend(default_backend_url)
    if not default_backend_info.get('mode'):
        return

    base_client_api = redis.get('base_client_api', dtype=str)
    proompters_5_min = len(redis.zrangebyscore('recent_prompters', time.time() - 5 * 60, '+inf'))
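
    # Assemble the public stats payload. Most fields are gated behind opts.show_* flags and
    # are reported as None when the corresponding option is disabled.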
    output = {
        'default': {
            'model': default_backend_info['model'],
            'backend': default_backend_url,
        },
        'stats': {
            'proompters': {
                '5_min': proompters_5_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            # 'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
            'num_backends': len(cluster_config.all()) if opts.show_backends else None,
        },
        'endpoints': {
            'blocking': f'https://{base_client_api}',
            'streaming': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
            'api_mode': opts.frontend_api_mode,
        },
        'keys': {
            'openaiKeys': '',
            'anthropicKeys': '',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
    }
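
    # Per-backend details: add an entry for every online backend, keyed by the backend's hash.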
    # TODO: have get_model_choices() return all the info so we don't have to loop over the backends ourselves
    if opts.show_backends:
        for backend_url, v in cluster_config.all().items():
            backend_info = cluster_config.get_backend(backend_url)
            if not backend_info['online']:
                continue
            backend_uptime = int((datetime.now() - datetime.fromtimestamp(backend_info['startup_time'])).total_seconds()) if opts.show_uptime else None
            output['backend_info'][backend_info['hash']] = {
                'uptime': backend_uptime,
                'max_tokens': backend_info['model_config']['max_position_embeddings'],
                'model': backend_info['model'],
                'mode': backend_info['mode'],
                'nvidia': backend_info['nvidia'],
                'priority': backend_info['priority'],
            }
    else:
        output['backend_info'] = {}
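
    # Only the default model name (the second element returned by get_model_choices()) is exposed here.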
    output['default_model'] = get_model_choices(regen=True)[1]

    result = deep_sort(output)

    # It may take a bit to get the base client API, so don't cache until then.
    if base_client_api:
        redis.setp('proxy_stats', result)

    return result
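
# Usage sketch (not part of this file): generate_stats() is presumably called from a Flask
# view elsewhere in the repo, and refreshed with regen=True by a background task so the
# cached 'proxy_stats' value stays current. Assuming a blueprint object `bp` and
# flask.jsonify (both illustrative, not confirmed by this file), that could look like:
#
#     @bp.route('/stats')
#     def stats():
#         return jsonify(generate_stats()), 200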