import time
from datetime import datetime

from flask import jsonify, request

from llm_server import opts
from . import bp
from .. import stats
from ..cache import cache
from ..queue import priority_queue
from ..stats import SemaphoreCheckerThread, calculate_avg_gen_time
from ...llm.info import get_running_model


@bp.route('/stats', methods=['GET'])
@cache.cached(timeout=5, query_string=True)
def get_stats():
    model_list, error = get_running_model()  # will return False when the fetch fails
    online = not isinstance(model_list, bool)

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    average_generation_time = int(calculate_avg_gen_time())

    return jsonify({
        'stats': {
            'prompts_in_queue': len(priority_queue),
            'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
            'total_proompts': stats.get_count(),
            'uptime': int((datetime.now() - stats.server_start_time).total_seconds()),
            'average_generation_elapsed_sec': average_generation_time,
        },
        'online': online,
        'mode': opts.mode,
        'model': model_list,
        'endpoints': {
            'blocking': f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}',
        },
        # naive estimate: average generation time multiplied by the current queue depth
        'estimated_wait_sec': int(average_generation_time * len(priority_queue)),
        'timestamp': int(time.time()),
        'openaiKeys': '∞',
        'anthropicKeys': '∞',
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
        }
    }), 200
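
# A minimal sketch of how a client could consume this endpoint. The mount point
# (/api/v1) and host (localhost:5000) are assumptions and depend on how the app
# registers this blueprint; the JSON keys match the response built above.
#
#   import requests
#
#   resp = requests.get('http://localhost:5000/api/v1/stats', timeout=5)
#   data = resp.json()
#   if data['online']:
#       print(f"{data['stats']['prompts_in_queue']} prompts queued, "
#             f"~{data['estimated_wait_sec']}s estimated wait")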