import time
from datetime import datetime

from llm_server import opts
from llm_server.cluster.backend import get_a_cluster_backend
from llm_server.cluster.cluster_config import cluster_config
from llm_server.cluster.model_choices import get_model_choices
from llm_server.custom_redis import redis
from llm_server.database.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort
from llm_server.routes.stats import get_total_proompts, server_start_time
|
def generate_stats(regen: bool = False):
    """Build (and cache in Redis) the public stats payload for the proxy.

    Collects recent-prompter counts, uptime, token totals, endpoint URLs,
    config flags and per-backend details into a single deep-sorted dict.

    :param regen: when False, return the cached payload from Redis if one
        exists; when True, always rebuild it.
    :return: the stats dict, or None if the cluster has no usable default
        backend yet.
    """
    if not regen:
        cached = redis.getp('proxy_stats')
        if cached:
            return cached

    default_backend_url = get_a_cluster_backend()
    default_backend_info = cluster_config.get_backend(default_backend_url)
    if not default_backend_info.get('mode'):
        # The default backend hasn't finished initializing yet, so there is
        # nothing meaningful to report.
        return

    base_client_api = redis.get('base_client_api', dtype=str)
    # Prompters seen in the last 5 minutes — sorted-set scores are timestamps,
    # so everything from (now - 300s) onward counts as active.
    proompters_5_min = len(redis.zrangebyscore('recent_prompters', time.time() - 5 * 60, '+inf'))

    output = {
        'stats': {
            'proompters': {
                '5_min': proompters_5_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            # 'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
            'num_backends': len(cluster_config.all()) if opts.show_backends else None,
        },
        'endpoints': {
            'blocking': f'https://{base_client_api}',
            'streaming': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'concurrent': opts.concurrent_gens,
            'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
        },
        'keys': {
            'openaiKeys': '∞',
            'anthropicKeys': '∞',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
    }

    # TODO: have get_model_choices() return all the info so we don't have to loop over the backends ourself
    if opts.show_backends:
        # BUG FIX: when show_backend_info is off, output['backend_info'] is
        # None and the item assignments below used to raise a TypeError.
        # Fall back to a fresh dict in that case.
        backend_details = output['backend_info'] if isinstance(output['backend_info'], dict) else {}
        for backend_url in cluster_config.all():
            backend_info = cluster_config.get_backend(backend_url)
            if not backend_info['online']:
                continue  # only advertise reachable backends
            backend_uptime = int((datetime.now() - datetime.fromtimestamp(backend_info['startup_time'])).total_seconds()) if opts.show_uptime else None
            backend_details[backend_info['hash']] = {
                'uptime': backend_uptime,
                'max_tokens': backend_info['model_config']['max_position_embeddings'],
                'model': backend_info['model'],
                'mode': backend_info['mode'],
                'nvidia': backend_info['nvidia'],
            }
        output['backend_info'] = backend_details
    else:
        output['backend_info'] = {}

    # NOTE: the hand-built 'default' entry the old code constructed up front
    # was always overwritten here, so it is no longer built at all.
    output['default'] = get_model_choices(regen=True)[1]

    result = deep_sort(output)

    # It may take a bit to get the base client API, so don't cache until then.
    if base_client_api:
        redis.setp('proxy_stats', result)

    return result
|