active gen workers wait

This commit is contained in:
Cyberes 2023-09-23 21:17:13 -06:00
parent 7ee2311183
commit fab7b7ccdd
2 changed files with 7 additions and 7 deletions

View File

@@ -12,10 +12,9 @@ from llm_server.routes.stats import calculate_avg_gen_time, get_active_gen_worke
def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers): def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
workers_running = gen_time_calc if active_gen_workers > 0 else 0 if active_gen_workers < concurrent_gens:
if proompters_in_queue < concurrent_gens:
return 0 return 0
elif proompters_in_queue >= concurrent_gens: elif active_gen_workers >= concurrent_gens:
# Calculate how long it will take to complete the currently running gens and the queued requests. # Calculate how long it will take to complete the currently running gens and the queued requests.
# If the proompters in the queue are equal to the number of workers, just use the calculated generation time. # If the proompters in the queue are equal to the number of workers, just use the calculated generation time.
# Otherwise, use how many requests we can process concurrently times the calculated generation time. Then, round # Otherwise, use how many requests we can process concurrently times the calculated generation time. Then, round
@@ -25,7 +24,7 @@ def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, act
# Regardless, this is the most accurate estimate we can get without tracking worker elapsed times. # Regardless, this is the most accurate estimate we can get without tracking worker elapsed times.
proompters_in_queue_wait_time = gen_time_calc if (proompters_in_queue / concurrent_gens) <= 1 \ proompters_in_queue_wait_time = gen_time_calc if (proompters_in_queue / concurrent_gens) <= 1 \
else round_up_base(((proompters_in_queue / concurrent_gens) * gen_time_calc), base=gen_time_calc) else round_up_base(((proompters_in_queue / concurrent_gens) * gen_time_calc), base=gen_time_calc)
return proompters_in_queue_wait_time + workers_running return proompters_in_queue_wait_time + gen_time_calc if active_gen_workers > 0 else 0
elif proompters_in_queue == 0 and active_gen_workers == 0: elif proompters_in_queue == 0 and active_gen_workers == 0:
# No queue, no workers # No queue, no workers
return 0 return 0

View File

@@ -2,6 +2,7 @@ import os
import sys import sys
from pathlib import Path from pathlib import Path
from threading import Thread from threading import Thread
import simplejson as json import simplejson as json
from flask import Flask, jsonify, render_template, request from flask import Flask, jsonify, render_template, request
@@ -28,7 +29,7 @@ from llm_server.helpers import resolve_path
from llm_server.llm.vllm.info import vllm_info from llm_server.llm.vllm.info import vllm_info
from llm_server.routes.cache import cache, redis from llm_server.routes.cache import cache, redis
from llm_server.routes.queue import start_workers from llm_server.routes.queue import start_workers
from llm_server.routes.stats import SemaphoreCheckerThread, process_avg_gen_time from llm_server.routes.stats import SemaphoreCheckerThread, get_active_gen_workers, process_avg_gen_time
from llm_server.routes.v1 import bp from llm_server.routes.v1 import bp
from llm_server.routes.v1.generate_stats import generate_stats from llm_server.routes.v1.generate_stats import generate_stats
from llm_server.stream import init_socketio from llm_server.stream import init_socketio
@@ -97,7 +98,6 @@ if config['average_generation_time_mode'] not in ['database', 'minute']:
sys.exit(1) sys.exit(1)
opts.average_generation_time_mode = config['average_generation_time_mode'] opts.average_generation_time_mode = config['average_generation_time_mode']
if opts.mode == 'oobabooga': if opts.mode == 'oobabooga':
raise NotImplementedError raise NotImplementedError
# llm_server.llm.tokenizer = OobaboogaBackend() # llm_server.llm.tokenizer = OobaboogaBackend()
@@ -142,7 +142,8 @@ def home():
else: else:
running_model = opts.running_model running_model = opts.running_model
if stats['queue']['queued'] == 0 and stats['queue']['processing'] > 0: active_gen_workers = get_active_gen_workers()
if stats['queue']['queued'] == 0 and active_gen_workers >= opts.concurrent_gens:
# There will be a wait if the queue is empty but prompts are processing, but we don't # There will be a wait if the queue is empty but prompts are processing, but we don't
# know how long. # know how long.
estimated_wait_sec = f"less than {stats['stats']['average_generation_elapsed_sec']} seconds" estimated_wait_sec = f"less than {stats['stats']['average_generation_elapsed_sec']} seconds"