from datetime import datetime

from llm_server.custom_redis import redis
from llm_server.helpers import round_up_base

server_start_time = datetime.now()


def get_total_proompts():
    """Return the total number of proompts (prompts) processed, as counted in Redis."""
    count = redis.get('proompts')
    if count is None:
        count = 0
    else:
        count = int(count)
    return count
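
# The 'proompts' counter is only read here; it is presumably incremented by the
# request-handling code elsewhere in the project. A hypothetical producer-side
# sketch, assuming the custom wrapper proxies redis-py's INCR:
#
#     redis.incr('proompts')  # atomic; creates the key with value 1 if missing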


def get_active_gen_workers_model(selected_model: str = None):
    # `dtype` and `default` are features of the project's custom Redis wrapper
    # (llm_server.custom_redis), not of redis-py: presumably the stored value is
    # cast to int, with 0 returned when the key is missing. Note that passing
    # selected_model=None reads the literal key 'active_gen_workers:None'.
    return redis.get(f'active_gen_workers:{selected_model}', dtype=int, default=0)
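
# A minimal sketch of how the wrapper's `dtype`/`default` keywords might be
# implemented; this is an assumption for illustration, not the project's
# actual llm_server.custom_redis code:
#
#     import redis as redis_lib
#
#     class RedisCustom(redis_lib.Redis):
#         def get(self, key, dtype=None, default=None):
#             value = super().get(key)
#             if value is None:
#                 return default
#             return dtype(value) if dtype else value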


def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    """Estimate how long a newly queued request will wait before a worker picks it up."""
    if active_gen_workers < concurrent_gens:
        return 0
    elif active_gen_workers >= concurrent_gens:
        # Calculate how long it will take to complete the currently running gens and the queued requests.
        # If the queue fits within one batch of workers (queue length <= concurrency), just use the calculated
        # generation time. Otherwise, divide the queue length by how many requests we can process concurrently,
        # multiply by the calculated generation time, and round the result up to the nearest multiple of
        # gen_time_calc (i.e. if gen_time_calc is 8 and the calculated number is 11.6, we get 16). Finally,
        # add gen_time_calc to account for the currently running generations.
        # This assumes that all active workers will finish at the same time, which is unlikely.
        # Regardless, this is the most accurate estimate we can get without tracking worker elapsed times.
        proompters_in_queue_wait_time = gen_time_calc if (proompters_in_queue / concurrent_gens) <= 1 \
            else round_up_base(((proompters_in_queue / concurrent_gens) * gen_time_calc), base=gen_time_calc)
        return proompters_in_queue_wait_time + gen_time_calc if active_gen_workers > 0 else 0
    # NOTE: the two branches below are unreachable, since the conditions above already
    # cover every case (active_gen_workers is always either < or >= concurrent_gens).
    elif proompters_in_queue == 0 and active_gen_workers == 0:
        # No queue, no workers
        return 0
    else:
        # No queue
        return gen_time_calc
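
# A quick worked example with hypothetical numbers, assuming round_up_base
# rounds up to the nearest multiple of `base`:
#
#     calculate_wait_time(gen_time_calc=8, proompters_in_queue=10,
#                         concurrent_gens=4, active_gen_workers=4)
#     -> 10 / 4 = 2.5 queued batches, which is > 1
#     -> round_up_base(2.5 * 8, base=8) = round_up_base(20.0, base=8) = 24
#     -> 24 + 8 (for the generations already running) = 32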