"""Helpers for the prompt counter, active-worker counts and wait-time estimates."""

from datetime import datetime
from typing import Optional

from llm_server.custom_redis import redis
from llm_server.helpers import round_up_base

# Naive local timestamp captured once, when this module is first imported.
server_start_time = datetime.now()


def get_total_proompts() -> int:
    """Return the value stored under the ``'proompts'`` Redis key as an int.

    Returns:
        ``0`` when the key has no value (``redis.get`` returned ``None``),
        otherwise the stored value converted with ``int()``.
    """
    stored = redis.get('proompts')
    return 0 if stored is None else int(stored)


def get_active_gen_workers_model(selected_model: Optional[str] = None):
    """Look up the active generation worker count recorded for a model.

    Args:
        selected_model: Model name interpolated into the Redis key
            ``active_gen_workers:<selected_model>``. When left as ``None``
            the key literally ends in ``None``.

    Returns:
        Whatever ``redis.get`` yields for that key when called with
        ``dtype=int`` and ``default=0`` (presumably an int, ``0`` when the
        key is missing -- confirm against ``llm_server.custom_redis``).
    """
    return redis.get(f'active_gen_workers:{selected_model}', dtype=int, default=0)


def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    """Estimate how long a new request has to wait before it is processed.

    Args:
        gen_time_calc: Calculated time a single generation takes. The result
            is expressed in the same unit.
        proompters_in_queue: Number of requests waiting in the queue.
        concurrent_gens: Number of generations that can run concurrently.
        active_gen_workers: Number of workers currently generating.

    Returns:
        ``0`` when there is spare capacity
        (``active_gen_workers < concurrent_gens``) or when no worker is
        active. Otherwise the estimated time to drain the queue plus one
        ``gen_time_calc`` for the generations that are already running.

    Raises:
        ZeroDivisionError: If ``concurrent_gens`` is ``0`` while
            ``active_gen_workers`` is not negative.
    """
    # A free worker slot means the request can start immediately.
    if active_gen_workers < concurrent_gens:
        return 0

    # From here on every slot is busy (active_gen_workers >= concurrent_gens).
    # The former trailing "no queue" branches could never run because the two
    # comparisons are exhaustive for ordinary numbers, so they were dropped.
    #
    # Estimate how long the queued requests take to complete:
    # - If the queue fits into a single round of concurrent generations
    #   (queue / concurrent_gens <= 1), one generation time is enough.
    # - Otherwise scale the generation time by the number of rounds and round
    #   the result up to the base gen_time_calc via round_up_base.
    #   NOTE(review): the previous comment claimed 11.6 with base 8 becomes 18;
    #   a round-up to a multiple of 8 would give 16 -- confirm against
    #   round_up_base.
    rounds_needed = proompters_in_queue / concurrent_gens
    if rounds_needed <= 1:
        queue_wait_time = gen_time_calc
    else:
        queue_wait_time = round_up_base(rounds_needed * gen_time_calc, base=gen_time_calc)

    # Without any active worker there is nothing to wait for. This is only
    # reachable when concurrent_gens is itself zero or negative.
    if active_gen_workers <= 0:
        return 0

    # Add one more gen_time_calc to account for the currently running
    # generations. This assumes all active workers finish at the same time,
    # which is unlikely, but it is the most accurate estimate available
    # without tracking worker elapsed times.
    return queue_wait_time + gen_time_calc