active gen workers wait
parent 7ee2311183
commit fab7b7ccdd
@@ -12,10 +12,9 @@ from llm_server.routes.stats import calculate_avg_gen_time, get_active_gen_workers
 
 
 def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
-    workers_running = gen_time_calc if active_gen_workers > 0 else 0
-    if proompters_in_queue < concurrent_gens:
+    if active_gen_workers < concurrent_gens:
         return 0
-    elif proompters_in_queue >= concurrent_gens:
+    elif active_gen_workers >= concurrent_gens:
         # Calculate how long it will take to complete the currently running gens and the queued requests.
         # If the proompters in the queue are equal to the number of workers, just use the calculated generation time.
         # Otherwise, use how many requests we can process concurrently times the calculated generation time. Then, round
@@ -25,7 +24,7 @@ def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
         # Regardless, this is the most accurate estimate we can get without tracking worker elapsed times.
         proompters_in_queue_wait_time = gen_time_calc if (proompters_in_queue / concurrent_gens) <= 1 \
             else round_up_base(((proompters_in_queue / concurrent_gens) * gen_time_calc), base=gen_time_calc)
-        return proompters_in_queue_wait_time + workers_running
+        return proompters_in_queue_wait_time + gen_time_calc if active_gen_workers > 0 else 0
     elif proompters_in_queue == 0 and active_gen_workers == 0:
         # No queue, no workers
         return 0
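For context, here is a runnable sketch of the patched wait-time logic. `round_up_base` is not shown in this diff; the body below assumes it rounds a value up to the nearest multiple of `base`, and the numbers in the example are purely illustrative.

import math


def round_up_base(x, base):
    # Assumed helper (not part of this diff): round x up to the nearest
    # multiple of `base`.
    return math.ceil(x / base) * base


def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    if active_gen_workers < concurrent_gens:
        # A worker slot is free, so a new request starts immediately.
        return 0
    # Every slot is busy: queued requests are served in batches of
    # `concurrent_gens`, each batch costing roughly one generation time.
    proompters_in_queue_wait_time = gen_time_calc if (proompters_in_queue / concurrent_gens) <= 1 \
        else round_up_base((proompters_in_queue / concurrent_gens) * gen_time_calc, base=gen_time_calc)
    # Plus one generation time for the batch that is currently running.
    return proompters_in_queue_wait_time + gen_time_calc if active_gen_workers > 0 else 0


# 32s average generation, 5 queued, 3 slots, 3 busy workers:
# round_up_base(5/3 * 32, 32) = 64s of queue wait + 32s for the running batch = 96s.
print(calculate_wait_time(32, 5, 3, 3))  # 96

Worth noting: Python's conditional expression binds looser than `+`, so the new return parses as `(proompters_in_queue_wait_time + gen_time_calc) if active_gen_workers > 0 else 0`, and since this branch is only reached when `active_gen_workers >= concurrent_gens`, the `else 0` arm can never fire for any positive `concurrent_gens`.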
@@ -2,6 +2,7 @@ import os
 import sys
 from pathlib import Path
 from threading import Thread
+
 import simplejson as json
 from flask import Flask, jsonify, render_template, request
 
@@ -28,7 +29,7 @@ from llm_server.helpers import resolve_path
 from llm_server.llm.vllm.info import vllm_info
 from llm_server.routes.cache import cache, redis
 from llm_server.routes.queue import start_workers
-from llm_server.routes.stats import SemaphoreCheckerThread, process_avg_gen_time
+from llm_server.routes.stats import SemaphoreCheckerThread, get_active_gen_workers, process_avg_gen_time
 from llm_server.routes.v1 import bp
 from llm_server.routes.v1.generate_stats import generate_stats
 from llm_server.stream import init_socketio
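`get_active_gen_workers` itself is not defined in this commit. Since the module already shares a `redis` handle (imported above), one plausible shape for the counter is sketched below; the key name and the start/end helper names are hypothetical, and the real `llm_server.routes.stats` may track this differently.

from redis import Redis  # standalone stand-in for the shared llm_server.routes.cache.redis handle

r = Redis()


def get_active_gen_workers() -> int:
    # Hypothetical: number of generation workers currently busy, kept as a
    # plain integer key that workers bump on start and drop on finish.
    return int(r.get('active_gen_workers') or 0)


def mark_gen_start():
    r.incr('active_gen_workers')  # hypothetical helper


def mark_gen_end():
    r.decr('active_gen_workers')  # hypothetical helper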
@@ -97,7 +98,6 @@ if config['average_generation_time_mode'] not in ['database', 'minute']:
     sys.exit(1)
 opts.average_generation_time_mode = config['average_generation_time_mode']
 
-
 if opts.mode == 'oobabooga':
     raise NotImplementedError
     # llm_server.llm.tokenizer = OobaboogaBackend()
@@ -142,7 +142,8 @@ def home():
     else:
         running_model = opts.running_model
 
-    if stats['queue']['queued'] == 0 and stats['queue']['processing'] > 0:
+    active_gen_workers = get_active_gen_workers()
+    if stats['queue']['queued'] == 0 and active_gen_workers >= opts.concurrent_gens:
         # There will be a wait if the queue is empty but prompts are processing, but we don't
         # know how long.
         estimated_wait_sec = f"less than {stats['stats']['average_generation_elapsed_sec']} seconds"
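The effect of the `home()` change in isolation: the "less than N seconds" notice now appears only when nothing is queued but every generation slot is occupied, rather than whenever anything is processing. A minimal sketch with made-up values; the `else` arm is an assumption, since the surrounding code falls outside this hunk.

# Stand-ins for generate_stats() output, opts.concurrent_gens, and get_active_gen_workers():
stats = {'queue': {'queued': 0}, 'stats': {'average_generation_elapsed_sec': 32}}
concurrent_gens = 3
active_gen_workers = 3

if stats['queue']['queued'] == 0 and active_gen_workers >= concurrent_gens:
    # Queue is empty but all slots are busy: a new request waits for at most
    # one average generation to finish.
    estimated_wait_sec = f"less than {stats['stats']['average_generation_elapsed_sec']} seconds"
else:
    estimated_wait_sec = "0 seconds"  # assumed fallback, not shown in the diff

print(estimated_wait_sec)  # less than 32 seconds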