local-llm-server/llm_server/routes/v1/generate_stats.py

import time
from datetime import datetime

from llm_server import opts
from llm_server.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort
from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_states
from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time


# TODO: have routes/__init__.py point to the latest API version generate_stats()
def generate_stats():
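    """Collect backend, queue, and configuration stats into a deep-sorted dict for the v1 stats endpoint."""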
    model_name, error = get_running_model()  # will return False when the fetch fails
    if isinstance(model_name, bool):
        online = False
    else:
        online = True
        opts.running_model = model_name

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    active_gen_workers = get_active_gen_workers()
    proompters_in_queue = len(priority_queue)
    average_tps = float(redis.get('average_tps'))

    if opts.average_generation_time_mode == 'database':
        average_generation_time = float(redis.get('average_generation_elapsed_sec'))
        average_output_tokens = float(redis.get('average_output_tokens'))
        # average_generation_time_from_tps = (average_output_tokens / average_tps)

        # What to use in our math that calculates the wait time.
        # We could use the average TPS but we don't know the exact TPS value, only
        # the backend knows that. So, let's just stick with the elapsed time.
        gen_time_calc = average_generation_time

        estimated_wait_sec = (
            (gen_time_calc * proompters_in_queue) / opts.concurrent_gens  # Calculate wait time for items in queue
        ) + (
            active_gen_workers * gen_time_calc  # Calculate wait time for in-process items
        ) if average_tps > 0 else 0
    elif opts.average_generation_time_mode == 'minute':
        average_generation_time = calculate_avg_gen_time()
        gen_time_calc = average_generation_time
        estimated_wait_sec = ((gen_time_calc * proompters_in_queue) / opts.concurrent_gens) + (active_gen_workers * gen_time_calc)
    else:
        raise Exception(f'Unknown average_generation_time_mode: {opts.average_generation_time_mode}')
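
    # Rough worked example with assumed numbers: if gen_time_calc is 30 sec, 4 prompts
    # are queued, opts.concurrent_gens is 3, and 2 workers are active, the estimate is
    # (30 * 4) / 3 + (2 * 30) = 100 seconds.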

    if opts.netdata_root:
        netdata_stats = {}
        power_states = get_power_states()
        for gpu, power_state in power_states.items():
            netdata_stats[gpu] = {
                'power_state': power_state,
                # 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
            }
    else:
        netdata_stats = {}

    output = {
        'stats': {
            'proompters': {
                '1_min': SemaphoreCheckerThread.proompters_1_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': int(gen_time_calc),
            'average_tps': average_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
            'nvidia': netdata_stats,
        },
        'online': online,
        'endpoints': {
            'blocking': opts.full_client_api,
        },
        'queue': {
            'processing': active_gen_workers,
            'queued': proompters_in_queue,
            'estimated_wait_sec': int(estimated_wait_sec),
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'queue_size': opts.concurrent_gens,
            'model': model_name,
            'mode': opts.mode,
            'simultaneous_requests': opts.ip_in_queue_max,
        },
        'keys': {
            'openaiKeys': '',
            'anthropicKeys': '',
        },
    }

    return deep_sort(output)
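

# A minimal, hypothetical sketch of the wiring the TODO above refers to: exposing
# generate_stats() as a JSON endpoint from a route module (assuming a Flask app;
# the blueprint name and URL below are illustrative, not this repo's actual routing).
#
# from flask import Blueprint, jsonify
#
# from llm_server.routes.v1.generate_stats import generate_stats
#
# bp = Blueprint('v1_stats', __name__)
#
#
# @bp.route('/api/v1/stats')
# def stats():
#     return jsonify(generate_stats())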