# local-llm-server/llm_server/routes/stats.py

from datetime import datetime

from llm_server.custom_redis import redis
from llm_server.helpers import round_up_base

# Captured once, when this module is first imported. datetime.now() without a
# tz argument yields a naive local-time value.
server_start_time = datetime.now()


def get_total_proompts():
    """Return the value stored under the ``proompts`` Redis key as an int.

    Yields 0 when the lookup returns ``None``.
    """
    raw_total = redis.get('proompts')
    return 0 if raw_total is None else int(raw_total)


def get_active_gen_workers_model(selected_model: str = None):
    """Fetch the ``active_gen_workers:<selected_model>`` value from Redis.

    The lookup is made with ``dtype=int`` and ``default=0``. Note that the
    default ``selected_model=None`` produces the key ``active_gen_workers:None``.
    """
    worker_key = f'active_gen_workers:{selected_model}'
    return redis.get(worker_key, default=0, dtype=int)


def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    """Estimate how long a new request waits before it can be processed.

    Args:
        gen_time_calc: Calculated time for a single generation (same unit as
            the returned estimate).
        proompters_in_queue: Number of requests currently waiting in the queue.
        concurrent_gens: How many generations can run at the same time.
        active_gen_workers: Number of workers currently generating.

    Returns:
        0 when a worker slot is free or nothing is running; otherwise the
        estimated time for the queue to drain plus one ``gen_time_calc`` for
        the generations that are already running.

    Raises:
        ZeroDivisionError: If ``concurrent_gens`` is 0 while workers are active.
    """
    # A free slot means the request can start right away.
    if active_gen_workers < concurrent_gens:
        return 0

    # Nothing is running, so there is nothing to wait for. Checking this before
    # the division below also keeps concurrent_gens == 0 (with no active
    # workers) from raising ZeroDivisionError.
    if active_gen_workers <= 0:
        return 0

    # All slots are busy. Estimate the time to work through the queue:
    # - If the queue fits into a single round of concurrent generations, that
    #   takes one gen_time_calc.
    # - Otherwise scale gen_time_calc by the number of rounds and round the
    #   result up to a multiple of gen_time_calc (presumably what
    #   round_up_base does, e.g. 11.6 with base 8 -> 16 -- confirm in helpers).
    # This assumes all active workers finish at the same time, which is
    # unlikely, but it is the best estimate available without tracking each
    # worker's elapsed time.
    queue_rounds = proompters_in_queue / concurrent_gens
    if queue_rounds <= 1:
        queue_wait_time = gen_time_calc
    else:
        queue_wait_time = round_up_base(queue_rounds * gen_time_calc, base=gen_time_calc)

    # Add one more gen_time_calc for the generations that are running now.
    return queue_wait_time + gen_time_calc