local-llm-server/llm_server/routes/v1/generate_stats.py

import time
from datetime import datetime

from llm_server import opts
from llm_server.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort
from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_states
from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time


# TODO: have routes/__init__.py point to the latest API version generate_stats()
def generate_stats():
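    """Collect backend, queue, and configuration stats into a deep-sorted dict for the v1 stats endpoint."""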
    model_name, error = get_running_model()  # will return False when the fetch fails
    if isinstance(model_name, bool):
        online = False
    else:
        online = True
        opts.running_model = model_name

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    active_gen_workers = get_active_gen_workers()
    proompters_in_queue = len(priority_queue)
    average_tps = float(redis.get('average_tps'))

    if opts.average_generation_time_mode == 'database':
        average_generation_time = float(redis.get('average_generation_elapsed_sec'))
        average_output_tokens = float(redis.get('average_output_tokens'))
        # average_generation_time_from_tps = (average_output_tokens / average_tps)

        # What to use in our math that calculates the wait time.
        # We could use the average TPS but we don't know the exact TPS value, only
        # the backend knows that. So, let's just stick with the elapsed time.
        gen_time_calc = average_generation_time

        estimated_wait_sec = (
            (gen_time_calc * proompters_in_queue) / opts.concurrent_gens  # Calculate wait time for items in queue
        ) + (
            active_gen_workers * gen_time_calc  # Calculate wait time for in-process items
        ) if average_tps > 0 else 0
    elif opts.average_generation_time_mode == 'minute':
        average_generation_time = calculate_avg_gen_time()
        gen_time_calc = average_generation_time
        estimated_wait_sec = ((gen_time_calc * proompters_in_queue) / opts.concurrent_gens) + (active_gen_workers * gen_time_calc)
    else:
        raise Exception(f'Unknown average_generation_time_mode: {opts.average_generation_time_mode}')
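
    # Rough worked example with assumed numbers: if gen_time_calc is 30 sec, 4 prompts
    # are queued, opts.concurrent_gens is 3, and 2 workers are active, the estimate is
    # (30 * 4) / 3 + (2 * 30) = 100 seconds.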

    if opts.netdata_root:
        netdata_stats = {}
        power_states = get_power_states()
        for gpu, power_state in power_states.items():
            netdata_stats[gpu] = {
                'power_state': power_state,
                # 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
            }
    else:
        netdata_stats = {}

    output = {
        'stats': {
            'proompters': {
                '1_min': SemaphoreCheckerThread.proompters_1_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': int(gen_time_calc),
            'average_tps': average_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
            'nvidia': netdata_stats,
        },
        'online': online,
        'endpoints': {
            'blocking': opts.full_client_api,
        },
        'queue': {
            'processing': active_gen_workers,
            'queued': proompters_in_queue,
            'estimated_wait_sec': int(estimated_wait_sec),
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'queue_size': opts.concurrent_gens,
            'model': model_name,
            'mode': opts.mode,
            'simultaneous_requests': opts.ip_in_queue_max,
        },
        'keys': {
            'openaiKeys': '',
            'anthropicKeys': '',
        },
    }

    return deep_sort(output)
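

# A minimal, hypothetical sketch of the wiring the TODO above refers to: exposing
# generate_stats() as a JSON endpoint from a route module (assuming a Flask app;
# the blueprint name and URL below are illustrative, not this repo's actual routing).
#
# from flask import Blueprint, jsonify
#
# from llm_server.routes.v1.generate_stats import generate_stats
#
# bp = Blueprint('v1_stats', __name__)
#
#
# @bp.route('/api/v1/stats')
# def stats():
#     return jsonify(generate_stats())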