57 lines
3.6 KiB
Python
57 lines
3.6 KiB
Python
|
import time
|
||
|
|
||
|
from llm_server import opts
|
||
|
from llm_server.cluster.backend import get_a_cluster_backend, get_backends
|
||
|
from llm_server.cluster.cluster_config import cluster_config
|
||
|
from llm_server.custom_redis import redis
|
||
|
from llm_server.database.database import weighted_average_column_for_model
|
||
|
from llm_server.llm.info import get_running_model
|
||
|
|
||
|
|
||
|
def main_background_thread():
    """Refresh generation statistics for the cluster, forever, every 30 seconds.

    Each cycle does two things:

    1. For every online backend reported by ``get_backends()``, look up the
       backend's mode and running model, compute its stats with
       ``calc_stats_for_backend()`` and store them on the backend's entry in
       ``cluster_config``.
    2. Do the same for the backend returned by ``get_a_cluster_backend()`` and
       publish the result in redis under the un-prefixed keys
       ``average_generation_elapsed_sec``, ``average_output_tokens`` and
       ``estimated_avg_tps``.

    A backend whose running model cannot be fetched is skipped for that cycle.
    This function never returns; it is meant to be run in its own thread.
    """
    while True:
        online, _offline = get_backends()
        for backend_url in online:
            backend_info = cluster_config.get_backend(backend_url)
            backend_mode = backend_info['mode']
            running_model, err = get_running_model(backend_url, backend_mode)
            if err:
                # Could not query this backend; try again on the next cycle.
                continue

            average_generation_elapsed_sec, average_output_tokens, estimated_avg_tps = calc_stats_for_backend(backend_url, running_model, backend_mode)
            # calc_stats_for_backend() yields 0 when no value is available, so the
            # truthiness checks keep a previously stored value from being
            # overwritten with an empty one.
            if average_generation_elapsed_sec:
                cluster_config.set_backend_value(backend_url, 'average_generation_elapsed_sec', average_generation_elapsed_sec)
            if average_output_tokens:
                cluster_config.set_backend_value(backend_url, 'average_output_tokens', average_output_tokens)
            if average_generation_elapsed_sec and average_output_tokens:
                cluster_config.set_backend_value(backend_url, 'estimated_avg_tps', estimated_avg_tps)

        default_backend_url = get_a_cluster_backend()
        default_backend_info = cluster_config.get_backend(default_backend_url)
        default_backend_mode = default_backend_info['mode']
        default_running_model, err = get_running_model(default_backend_url, default_backend_mode)
        # Do not `continue` on error here: that would skip the sleep below and
        # turn the loop into a busy-wait for as long as the default backend is
        # unreachable.
        if not err:
            # The first argument is the backend URL (not the model name), matching
            # the signature of calc_stats_for_backend() and the per-backend call above.
            default_average_generation_elapsed_sec, default_average_output_tokens, default_estimated_avg_tps = calc_stats_for_backend(default_backend_url, default_running_model, default_backend_mode)
            if default_average_generation_elapsed_sec:
                redis.set('average_generation_elapsed_sec', default_average_generation_elapsed_sec)
            if default_average_output_tokens:
                redis.set('average_output_tokens', default_average_output_tokens)
            if default_average_generation_elapsed_sec and default_average_output_tokens:
                redis.set('estimated_avg_tps', default_estimated_avg_tps)

        time.sleep(30)
|
||
|
|
||
|
|
||
|
def calc_stats_for_backend(backend_url, running_model, backend_mode):
    """Compute average generation time, output tokens and tokens/sec for a backend.

    Both averages come from ``weighted_average_column_for_model()`` over the
    ``prompts`` table (columns ``generation_time`` and ``response_tokens``).
    A falsy result from the query is replaced with ``0``.

    :param backend_url: URL of the backend, forwarded to the query.
    :param running_model: model name, forwarded to the query.
    :param backend_mode: backend mode, forwarded to the query.
    :return: tuple ``(average_generation_elapsed_sec, average_output_tokens,
        estimated_avg_tps)``; the last item is the token/time ratio rounded to
        two decimals, or ``0`` when the average time is not positive.
    """

    def _column_average(column):
        # exclude_zeros=True filters out rows where an error message was returned.
        # Previously, if there was an error, 0 was entered into the column. The new
        # code enters null instead but we need to be backwards compatible for now.
        value = weighted_average_column_for_model(
            'prompts', column,
            running_model, backend_mode, backend_url,
            exclude_zeros=True,
            include_system_tokens=opts.include_system_tokens_in_stats,
        )
        return value or 0

    elapsed_sec = _column_average('generation_time')
    output_tokens = _column_average('response_tokens')

    if elapsed_sec > 0:
        tokens_per_sec = round(output_tokens / elapsed_sec, 2)
    else:
        # Avoid division by zero
        tokens_per_sec = 0

    return elapsed_sec, output_tokens, tokens_per_sec
|