local-llm-server/llm_server/threads.py

46 lines
1.6 KiB
Python

import time
from threading import Thread
import requests
from llm_server import opts
from llm_server.database import average_column
from llm_server.routes.cache import redis
class BackendHealthCheck(Thread):
backend_online = False
def __init__(self):
Thread.__init__(self)
self.daemon = True
redis.set('average_generation_elapsed_sec', 0)
redis.set('average_tps', 0)
redis.set('average_output_tokens', 0)
redis.set('backend_online', 0)
def run(self):
while True:
average_generation_elapsed_sec = average_column('prompts', 'generation_time') if not None else 0
redis.set('average_generation_elapsed_sec', average_generation_elapsed_sec)
average_output_tokens = average_column('prompts', 'response_tokens') if not None else 0
redis.set('average_output_tokens', average_output_tokens)
average_tps = round(average_output_tokens / average_generation_elapsed_sec, 2)
redis.set('average_tps', average_tps)
if opts.mode == 'oobabooga':
try:
r = requests.get(f'{opts.backend_url}/api/v1/model', timeout=3, verify=opts.verify_ssl)
opts.running_model = r.json()['result']
redis.set('backend_online', 1)
except Exception as e:
redis.set('backend_online', 0)
# TODO: handle error
print(e)
elif opts.mode == 'hf-textgen':
pass
else:
raise Exception
time.sleep(60)