2023-08-23 23:11:12 -06:00
|
|
|
import time
|
2023-08-24 12:19:59 -06:00
|
|
|
from datetime import datetime
|
2023-08-23 23:11:12 -06:00
|
|
|
|
|
|
|
from llm_server import opts
|
2023-09-20 20:30:31 -06:00
|
|
|
from llm_server.database.database import get_distinct_ips_24h, sum_column
|
2023-09-17 18:33:57 -06:00
|
|
|
from llm_server.helpers import deep_sort, round_up_base
|
2023-08-23 23:11:12 -06:00
|
|
|
from llm_server.llm.info import get_running_model
|
2023-08-27 22:17:21 -06:00
|
|
|
from llm_server.netdata import get_power_states
|
2023-09-25 17:20:21 -06:00
|
|
|
from llm_server.routes.cache import redis
|
2023-08-23 23:11:12 -06:00
|
|
|
from llm_server.routes.queue import priority_queue
|
2023-09-27 19:39:04 -06:00
|
|
|
from llm_server.routes.stats import get_active_gen_workers, get_total_proompts, server_start_time
|
2023-08-23 23:11:12 -06:00
|
|
|
|
|
|
|
|
2023-09-17 18:33:57 -06:00
|
|
|
def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    """Estimate how many seconds a newly submitted request will wait before generating.

    :param gen_time_calc: average generation time (seconds) for a single request.
    :param proompters_in_queue: number of requests currently waiting in the queue.
    :param concurrent_gens: maximum number of generations the backend runs at once.
    :param active_gen_workers: number of workers currently generating.
    :return: estimated wait in seconds; 0 when a worker slot is free.

    NOTE(review): this assumes ``concurrent_gens >= 1`` (presumably guaranteed by
    config validation — confirm); ``concurrent_gens == 0`` would divide by zero below.
    """
    if active_gen_workers < concurrent_gens:
        # A worker slot is free, so a new request starts immediately.
        return 0

    # All slots are busy (active_gen_workers >= concurrent_gens).
    # (The original also had `elif proompters_in_queue == 0 and active_gen_workers == 0`
    # and a trailing `else` branch here, but the two conditions above already cover every
    # input, so those branches were unreachable dead code and have been removed.)
    #
    # Calculate how long it will take to complete the currently running gens and the queued requests.
    # If the proompters in the queue are equal to the number of workers, just use the calculated generation time.
    # Otherwise, use how many requests we can process concurrently times the calculated generation time. Then, round
    # that number up to the nearest base gen_time_calc (ie. if gen_time_calc is 8 and the calculated number is 11.6, we will get 18). Finally,
    # add gen_time_calc to the time to account for the currently running generations.
    # This assumes that all active workers will finish at the same time, which is unlikely.
    # Regardless, this is the most accurate estimate we can get without tracking worker elapsed times.
    queue_ratio = proompters_in_queue / concurrent_gens
    if queue_ratio <= 1:
        proompters_in_queue_wait_time = gen_time_calc
    else:
        proompters_in_queue_wait_time = round_up_base(queue_ratio * gen_time_calc, base=gen_time_calc)

    # The original guarded this with `if active_gen_workers > 0 else 0`, but that guard was
    # unreachable here: reaching this point requires active_gen_workers >= concurrent_gens >= 1
    # (the concurrent_gens == 0 case raises ZeroDivisionError above before returning).
    return proompters_in_queue_wait_time + gen_time_calc
|
|
|
|
|
|
|
|
|
2023-08-24 20:43:11 -06:00
|
|
|
# TODO: have routes/__init__.py point to the latest API version generate_stats()
|
|
|
|
|
2023-09-25 17:20:21 -06:00
|
|
|
def generate_stats(regen: bool = False):
    """Build the proxy's public stats payload.

    Gathers live figures from redis, the database, the running backend, and
    (optionally) netdata, assembles them into a nested dict, deep-sorts it,
    and caches the result in redis under 'proxy_stats'.

    :param regen: when True, bypass the cached copy and recompute everything.
    :return: the deep-sorted stats dict (or the cached copy when regen is False
             and a cached value exists).
    """
    if not regen:
        # Serve the cached payload when one exists.
        c = redis.get('proxy_stats', dict)
        if c:
            return c

    model_name, error = get_running_model()  # will return False when the fetch fails
    if isinstance(model_name, bool):
        # Fetch failed, so the backend is considered offline.
        online = False
    else:
        online = True
        redis.set('running_model', model_name)

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    active_gen_workers = get_active_gen_workers()
    proompters_in_queue = len(priority_queue)

    # This is so wildly inaccurate it's disabled until I implement stats reporting into VLLM.
    # estimated_avg_tps = redis.get('estimated_avg_tps', float, default=0)

    # Average generation time (seconds) is maintained elsewhere in redis; defaults to 0
    # when no generations have been recorded yet.
    average_generation_time = redis.get('average_generation_elapsed_sec', float, default=0)
    estimated_wait_sec = calculate_wait_time(average_generation_time, proompters_in_queue, opts.concurrent_gens, active_gen_workers)

    if opts.netdata_root:
        # Collect per-GPU power-state info from netdata.
        netdata_stats = {}
        power_states = get_power_states()
        for gpu, power_state in power_states.items():
            netdata_stats[gpu] = {
                'power_state': power_state,
                # 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
            }
    else:
        netdata_stats = {}

    base_client_api = redis.get('base_client_api', str)
    # Count distinct recent prompters: sorted-set members scored within the last 5 minutes.
    proompters_5_min = len(redis.zrangebyscore('recent_prompters', time.time() - 5 * 60, '+inf'))

    output = {
        'stats': {
            'proompters': {
                '5_min': proompters_5_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            # Several fields are gated by config flags and reported as None when disabled.
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': int(average_generation_time),
            # 'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
        },
        'online': online,
        'endpoints': {
            'blocking': f'https://{base_client_api}',
            'streaming': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
        },
        'queue': {
            'processing': active_gen_workers,
            'queued': proompters_in_queue,
            'estimated_wait_sec': int(estimated_wait_sec),
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'concurrent': opts.concurrent_gens,
            # A manually configured display name takes precedence over the backend-reported one.
            'model': opts.manual_model_name if opts.manual_model_name else model_name,
            'mode': opts.mode,
            'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
        },
        'keys': {
            'openaiKeys': '∞',
            'anthropicKeys': '∞',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
        'nvidia': netdata_stats
    }
    # Deep-sort so the cached/served payload has a stable key order.
    result = deep_sort(output)

    # It may take a bit to get the base client API, so don't cache until then.
    if base_client_api:
        redis.set_dict('proxy_stats', result)  # Cache with no expiry
    return result
|