2023-08-23 23:11:12 -06:00
|
|
|
import time
|
2023-08-24 12:19:59 -06:00
|
|
|
from datetime import datetime
|
2023-08-23 23:11:12 -06:00
|
|
|
|
|
|
|
from llm_server import opts
|
2023-09-20 20:30:31 -06:00
|
|
|
from llm_server.database.database import get_distinct_ips_24h, sum_column
|
2023-09-17 18:33:57 -06:00
|
|
|
from llm_server.helpers import deep_sort, round_up_base
|
2023-08-23 23:11:12 -06:00
|
|
|
from llm_server.llm.info import get_running_model
|
2023-08-27 22:17:21 -06:00
|
|
|
from llm_server.netdata import get_power_states
|
2023-09-25 17:20:21 -06:00
|
|
|
from llm_server.routes.cache import redis
|
2023-08-23 23:11:12 -06:00
|
|
|
from llm_server.routes.queue import priority_queue
|
2023-09-27 19:39:04 -06:00
|
|
|
from llm_server.routes.stats import get_active_gen_workers, get_total_proompts, server_start_time
|
2023-08-23 23:11:12 -06:00
|
|
|
|
|
|
|
|
2023-09-17 18:33:57 -06:00
|
|
|
def calculate_wait_time(gen_time_calc, proompters_in_queue, concurrent_gens, active_gen_workers):
    """Estimate how many seconds a newly submitted request will wait before generating.

    :param gen_time_calc: average generation time (seconds) for a single request.
    :param proompters_in_queue: number of requests currently waiting in the queue.
    :param concurrent_gens: maximum number of generations the backend runs at once.
    :param active_gen_workers: number of workers currently generating.
    :return: estimated wait in seconds; 0 when a worker slot is free.

    NOTE(review): this assumes ``concurrent_gens >= 1`` (presumably guaranteed by
    config validation — confirm); ``concurrent_gens == 0`` would divide by zero below.
    """
    if active_gen_workers < concurrent_gens:
        # A worker slot is free, so a new request starts immediately.
        return 0

    # All slots are busy (active_gen_workers >= concurrent_gens).
    # (The original also had `elif proompters_in_queue == 0 and active_gen_workers == 0`
    # and a trailing `else` branch here, but the two conditions above already cover every
    # input, so those branches were unreachable dead code and have been removed.)
    #
    # Calculate how long it will take to complete the currently running gens and the queued requests.
    # If the proompters in the queue are equal to the number of workers, just use the calculated generation time.
    # Otherwise, use how many requests we can process concurrently times the calculated generation time. Then, round
    # that number up to the nearest base gen_time_calc (ie. if gen_time_calc is 8 and the calculated number is 11.6, we will get 18). Finally,
    # add gen_time_calc to the time to account for the currently running generations.
    # This assumes that all active workers will finish at the same time, which is unlikely.
    # Regardless, this is the most accurate estimate we can get without tracking worker elapsed times.
    queue_ratio = proompters_in_queue / concurrent_gens
    if queue_ratio <= 1:
        proompters_in_queue_wait_time = gen_time_calc
    else:
        proompters_in_queue_wait_time = round_up_base(queue_ratio * gen_time_calc, base=gen_time_calc)

    # The original guarded this with `if active_gen_workers > 0 else 0`, but that guard was
    # unreachable here: reaching this point requires active_gen_workers >= concurrent_gens >= 1
    # (the concurrent_gens == 0 case raises ZeroDivisionError above before returning).
    return proompters_in_queue_wait_time + gen_time_calc
|
|
|
|
|
|
|
|
|
2023-08-24 20:43:11 -06:00
|
|
|
# TODO: have routes/__init__.py point to the latest API version generate_stats()
|
|
|
|
|
2023-09-25 17:20:21 -06:00
|
|
|
def generate_stats(regen: bool = False):
    """Build the proxy's public stats payload.

    Gathers live figures from redis, the database, the running backend, and
    (optionally) netdata, assembles them into a nested dict, deep-sorts it,
    and caches the result in redis under 'proxy_stats'.

    :param regen: when True, bypass the cached copy and recompute everything.
    :return: the deep-sorted stats dict (or the cached copy when regen is False
             and a cached value exists).
    """
    if not regen:
        # Serve the cached payload when one exists.
        c = redis.get('proxy_stats', dict)
        if c:
            return c

    model_name, error = get_running_model()  # will return False when the fetch fails
    if isinstance(model_name, bool):
        # Fetch failed, so the backend is considered offline.
        online = False
    else:
        online = True
        redis.set('running_model', model_name)

    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
    # if len(t) == 0:
    #     estimated_wait = 0
    # else:
    #     waits = [elapsed for end, elapsed in t]
    #     estimated_wait = int(sum(waits) / len(waits))

    active_gen_workers = get_active_gen_workers()
    proompters_in_queue = len(priority_queue)

    # This is so wildly inaccurate it's disabled until I implement stats reporting into VLLM.
    # estimated_avg_tps = redis.get('estimated_avg_tps', float, default=0)

    # Average generation time (seconds) is maintained elsewhere in redis; defaults to 0
    # when no generations have been recorded yet.
    average_generation_time = redis.get('average_generation_elapsed_sec', float, default=0)
    estimated_wait_sec = calculate_wait_time(average_generation_time, proompters_in_queue, opts.concurrent_gens, active_gen_workers)

    if opts.netdata_root:
        # Collect per-GPU power-state info from netdata.
        netdata_stats = {}
        power_states = get_power_states()
        for gpu, power_state in power_states.items():
            netdata_stats[gpu] = {
                'power_state': power_state,
                # 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
            }
    else:
        netdata_stats = {}

    base_client_api = redis.get('base_client_api', str)
    # Count distinct recent prompters: sorted-set members scored within the last 5 minutes.
    proompters_5_min = len(redis.zrangebyscore('recent_prompters', time.time() - 5 * 60, '+inf'))

    output = {
        'stats': {
            'proompters': {
                '5_min': proompters_5_min,
                '24_hrs': get_distinct_ips_24h(),
            },
            # Several fields are gated by config flags and reported as None when disabled.
            'proompts_total': get_total_proompts() if opts.show_num_prompts else None,
            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
            'average_generation_elapsed_sec': int(average_generation_time),
            # 'estimated_avg_tps': estimated_avg_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
        },
        'online': online,
        'endpoints': {
            'blocking': f'https://{base_client_api}',
            'streaming': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
        },
        'queue': {
            'processing': active_gen_workers,
            'queued': proompters_in_queue,
            'estimated_wait_sec': int(estimated_wait_sec),
        },
        'timestamp': int(time.time()),
        'config': {
            'gatekeeper': 'none' if opts.auth_required is False else 'token',
            'context_size': opts.context_size,
            'concurrent': opts.concurrent_gens,
            # A manually configured display name takes precedence over the backend-reported one.
            'model': opts.manual_model_name if opts.manual_model_name else model_name,
            'mode': opts.mode,
            'simultaneous_requests_per_ip': opts.simultaneous_requests_per_ip,
        },
        'keys': {
            'openaiKeys': '∞',
            'anthropicKeys': '∞',
        },
        'backend_info': redis.get_dict('backend_info') if opts.show_backend_info else None,
        'nvidia': netdata_stats
    }
    # Deep-sort so the cached/served payload has a stable key order.
    result = deep_sort(output)

    # It may take a bit to get the base client API, so don't cache until then.
    if base_client_api:
        redis.set_dict('proxy_stats', result)  # Cache with no expiry
    return result
|