import time
from datetime import datetime

from llm_server import opts
from llm_server.cluster.backend import get_a_cluster_backend
from llm_server.cluster.cluster_config import cluster_config
from llm_server.cluster.model_choices import get_model_choices
from llm_server.custom_redis import redis
from llm_server.database.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort
from llm_server.routes.stats import get_total_proompts, server_start_time
|
def generate_stats(regen: bool = False):
    """Build (and cache in Redis) the public stats payload for the proxy.

    Collects recent-prompter counts, uptime, token totals, endpoint URLs,
    config flags and per-backend details into a single deep-sorted dict.

    :param regen: when False, return the cached payload from Redis if one
        exists; when True, always rebuild it.
    :return: the stats dict, or None if the cluster has no usable default
        backend yet.
    """
    if not regen:
        cached = redis.getp('proxy_stats')
        if cached:
            return cached

    default_backend_url = get_a_cluster_backend()
    default_backend_info = cluster_config.get_backend(default_backend_url)
    if not default_backend_info.get('mode'):
        # The default backend hasn't finished initializing yet, so there is
        # nothing meaningful to report.
        return

    base_client_api = redis.get('base_client_api', dtype=str)
    # Prompters seen in the last 5 minutes — sorted-set scores are timestamps,
    # so everything from (now - 300s) onward counts as active.
    proompters_5_min = len(redis.zrangebyscore('recent_prompters', time.time() - 5 * 60, '+inf'))

    output = {
        'stats': {
            'proompters': {
                '5_min': proompters_5_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            # 'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
            'num_backends': len(cluster_config.all()) if opts.show_backends else None,
        },
        'endpoints': {
            'blocking': f'https://{base_client_api}',
            'streaming': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'concurrent': opts.concurrent_gens,
            'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
        },
        'keys': {
            'openaiKeys': '∞',
            'anthropicKeys': '∞',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
    }

    # TODO: have get_model_choices() return all the info so we don't have to loop over the backends ourself
    if opts.show_backends:
        # BUG FIX: when show_backend_info is off, output['backend_info'] is
        # None and the item assignments below used to raise a TypeError.
        # Fall back to a fresh dict in that case.
        backend_details = output['backend_info'] if isinstance(output['backend_info'], dict) else {}
        for backend_url in cluster_config.all():
            backend_info = cluster_config.get_backend(backend_url)
            if not backend_info['online']:
                continue  # only advertise reachable backends
            backend_uptime = int((datetime.now() - datetime.fromtimestamp(backend_info['startup_time'])).total_seconds()) if opts.show_uptime else None
            backend_details[backend_info['hash']] = {
                'uptime': backend_uptime,
                'max_tokens': backend_info['model_config']['max_position_embeddings'],
                'model': backend_info['model'],
                'mode': backend_info['mode'],
                'nvidia': backend_info['nvidia'],
            }
        output['backend_info'] = backend_details
    else:
        output['backend_info'] = {}

    # NOTE: the hand-built 'default' entry the old code constructed up front
    # was always overwritten here, so it is no longer built at all.
    output['default'] = get_model_choices(regen=True)[1]

    result = deep_sort(output)

    # It may take a bit to get the base client API, so don't cache until then.
    if base_client_api:
        redis.setp('proxy_stats', result)

    return result
|