adjust estimated wait time calculations
This commit is contained in:
parent
441a870e85
commit
f43336c92c
|
@@ -5,7 +5,7 @@ from llm_server import opts
|
||||||
from llm_server.database import get_distinct_ips_24h, sum_column
|
from llm_server.database import get_distinct_ips_24h, sum_column
|
||||||
from llm_server.helpers import deep_sort
|
from llm_server.helpers import deep_sort
|
||||||
from llm_server.llm.info import get_running_model
|
from llm_server.llm.info import get_running_model
|
||||||
from llm_server.netdata import get_gpu_wh, get_power_states
|
from llm_server.netdata import get_power_states
|
||||||
from llm_server.routes.cache import redis
|
from llm_server.routes.cache import redis
|
||||||
from llm_server.routes.queue import priority_queue
|
from llm_server.routes.queue import priority_queue
|
||||||
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
|
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
|
||||||
|
@@ -28,16 +28,29 @@ def generate_stats():
|
||||||
# waits = [elapsed for end, elapsed in t]
|
# waits = [elapsed for end, elapsed in t]
|
||||||
# estimated_wait = int(sum(waits) / len(waits))
|
# estimated_wait = int(sum(waits) / len(waits))
|
||||||
|
|
||||||
|
active_gen_workers = get_active_gen_workers()
|
||||||
proompters_in_queue = len(priority_queue)
|
proompters_in_queue = len(priority_queue)
|
||||||
average_tps = float(redis.get('average_tps'))
|
average_tps = float(redis.get('average_tps'))
|
||||||
|
|
||||||
if opts.average_generation_time_mode == 'database':
|
if opts.average_generation_time_mode == 'database':
|
||||||
average_generation_time = int(float(redis.get('average_generation_elapsed_sec')))
|
average_generation_time = float(redis.get('average_generation_elapsed_sec'))
|
||||||
average_output_tokens = int(float(redis.get('average_output_tokens')))
|
average_output_tokens = float(redis.get('average_output_tokens'))
|
||||||
estimated_wait_sec = int(((average_output_tokens / average_tps) * proompters_in_queue) / opts.concurrent_gens) if average_tps > 0 else 0
|
# average_generation_time_from_tps = (average_output_tokens / average_tps)
|
||||||
|
|
||||||
|
# What to use in our math that calculates the wait time.
|
||||||
|
# We could use the average TPS but we don't know the exact TPS value, only
|
||||||
|
# the backend knows that. So, let's just stick with the elapsed time.
|
||||||
|
gen_time_calc = average_generation_time
|
||||||
|
|
||||||
|
estimated_wait_sec = (
|
||||||
|
(gen_time_calc * proompters_in_queue) / opts.concurrent_gens # Calculate wait time for items in queue
|
||||||
|
) + (
|
||||||
|
active_gen_workers * gen_time_calc # Calculate wait time for in-process items
|
||||||
|
) if average_tps > 0 else 0
|
||||||
elif opts.average_generation_time_mode == 'minute':
|
elif opts.average_generation_time_mode == 'minute':
|
||||||
average_generation_time = int(calculate_avg_gen_time())
|
average_generation_time = calculate_avg_gen_time()
|
||||||
estimated_wait_sec = int((average_generation_time * proompters_in_queue) / opts.concurrent_gens)
|
gen_time_calc = average_generation_time
|
||||||
|
estimated_wait_sec = ((gen_time_calc * proompters_in_queue) / opts.concurrent_gens) + (active_gen_workers * gen_time_calc)
|
||||||
else:
|
else:
|
||||||
raise Exception
|
raise Exception
|
||||||
|
|
||||||
|
@@ -60,12 +73,12 @@ def generate_stats():
|
||||||
'24_hrs': get_distinct_ips_24h(),
|
'24_hrs': get_distinct_ips_24h(),
|
||||||
},
|
},
|
||||||
'proompts': {
|
'proompts': {
|
||||||
'processing': get_active_gen_workers(),
|
'processing': active_gen_workers,
|
||||||
'queued': proompters_in_queue,
|
'queued': proompters_in_queue,
|
||||||
'total': get_total_proompts() if opts.show_num_prompts else None,
|
'total': get_total_proompts() if opts.show_num_prompts else None,
|
||||||
},
|
},
|
||||||
'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
|
'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
|
||||||
'average_generation_elapsed_sec': average_generation_time,
|
'average_generation_elapsed_sec': int(gen_time_calc),
|
||||||
'average_tps': average_tps,
|
'average_tps': average_tps,
|
||||||
'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
|
'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
|
||||||
'nvidia': netdata_stats
|
'nvidia': netdata_stats
|
||||||
|
@@ -74,7 +87,7 @@ def generate_stats():
|
||||||
'endpoints': {
|
'endpoints': {
|
||||||
'blocking': opts.full_client_api,
|
'blocking': opts.full_client_api,
|
||||||
},
|
},
|
||||||
'estimated_wait_sec': estimated_wait_sec,
|
'estimated_wait_sec': int(estimated_wait_sec),
|
||||||
'timestamp': int(time.time()),
|
'timestamp': int(time.time()),
|
||||||
'config': {
|
'config': {
|
||||||
'gatekeeper': 'none' if opts.auth_required is False else 'token',
|
'gatekeeper': 'none' if opts.auth_required is False else 'token',
|
||||||
|
|
Reference in New Issue