From f3fe514c119cccd7fe970cc3efc399cbda54a243 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Wed, 23 Aug 2023 23:11:12 -0600
Subject: [PATCH] add home template

---
 config/config.yml                      |  2 +
 llm_server/config.py                   |  7 ++-
 llm_server/opts.py                     |  3 +-
 llm_server/routes/v1/__init__.py       |  2 +
 llm_server/routes/v1/generate_stats.py | 48 +++++++++++++++++
 llm_server/routes/v1/proxy.py          | 44 ++--------------
 llm_server/threads.py                  | 32 ++++++++++++
 server.py                              | 36 +++++++++++--
 templates/home.html                    | 72 ++++++++++++++++++++++++++
 9 files changed, 200 insertions(+), 46 deletions(-)
 create mode 100644 llm_server/routes/v1/generate_stats.py
 create mode 100644 llm_server/threads.py
 create mode 100644 templates/home.html

diff --git a/config/config.yml b/config/config.yml
index 5316904..8c9833c 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -10,6 +10,8 @@ token_limit: 7777
 
 backend_url: https://10.0.0.86:8083
 
+llm_middleware_name: proxy.chub-archive.evulid.cc
+
 ## STATS ##
 
 # Display the total_proompts item on the stats screen.
diff --git a/llm_server/config.py b/llm_server/config.py
index 4d718ae..3f17533 100644
--- a/llm_server/config.py
+++ b/llm_server/config.py
@@ -10,7 +10,12 @@ config_default_vars = {
     'show_num_prompts': True,
     'show_uptime': True,
 }
-config_required_vars = ['token_limit', 'concurrent_gens', 'mode']
+config_required_vars = ['token_limit', 'concurrent_gens', 'mode', 'llm_middleware_name']
+
+mode_ui_names = {
+    'oobabooga': 'Text Gen WebUI (ooba)',
+    'hf-textgen': 'UNDEFINED',
+}
 
 
 class ConfigLoader:
diff --git a/llm_server/opts.py b/llm_server/opts.py
index 8bcbdce..e550f2b 100644
--- a/llm_server/opts.py
+++ b/llm_server/opts.py
@@ -1,4 +1,4 @@
-# Global settings that never change after startup/init
+# Read-only global variables
 
 running_model = 'none'
 concurrent_gens = 3
@@ -9,6 +9,7 @@ database_path = './proxy-server.db'
 auth_required = False
 log_prompts = False
 frontend_api_client = ''
+full_client_api = None
 http_host = None
 verify_ssl = True
 show_num_prompts = True
diff --git a/llm_server/routes/v1/__init__.py b/llm_server/routes/v1/__init__.py
index 0351270..5a5715d 100644
--- a/llm_server/routes/v1/__init__.py
+++ b/llm_server/routes/v1/__init__.py
@@ -12,6 +12,8 @@ bp = Blueprint('v1', __name__)
 def before_request():
     if not opts.http_host:
         opts.http_host = request.headers.get("Host")
+    if not opts.full_client_api:
+        opts.full_client_api = f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}'
     if request.endpoint != 'v1.get_stats':
         response = require_api_key()
         if response is not None:
diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
new file mode 100644
index 0000000..d9da9cc
--- /dev/null
+++ b/llm_server/routes/v1/generate_stats.py
@@ -0,0 +1,48 @@
+from datetime import datetime
+import time
+
+from llm_server import opts
+from llm_server.llm.info import get_running_model
+from llm_server.routes.queue import priority_queue
+from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
+
+
+def generate_stats():
+    model_list, error = get_running_model()  # will return False when the fetch fails
+    if isinstance(model_list, bool):
+        online = False
+    else:
+        online = True
+
+    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
+    # if len(t) == 0:
+    #     estimated_wait = 0
+    # else:
+    #     waits = [elapsed for end, elapsed in t]
+    #     estimated_wait = int(sum(waits) / len(waits))
+
+    average_generation_time = int(calculate_avg_gen_time())
+    proompters_in_queue = len(priority_queue) + get_active_gen_workers()
+
+    return {
+        'stats': {
+            'prompts_in_queue': proompters_in_queue,
+            'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
+            'total_proompts': get_total_proompts() if opts.show_num_prompts else None,
+            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
+            'average_generation_elapsed_sec': average_generation_time,
+        },
+        'online': online,
+        'mode': opts.mode,
+        'model': model_list,
+        'endpoints': {
+            'blocking': opts.full_client_api,
+        },
+        'estimated_wait_sec': int(average_generation_time * proompters_in_queue),
+        'timestamp': int(time.time()),
+        'openaiKeys': '∞',
+        'anthropicKeys': '∞',
+        'config': {
+            'gatekeeper': 'none' if opts.auth_required is False else 'token',
+            'context_size': opts.context_size,
+        }
+    }
diff --git a/llm_server/routes/v1/proxy.py b/llm_server/routes/v1/proxy.py
index 39f2115..585ce55 100644
--- a/llm_server/routes/v1/proxy.py
+++ b/llm_server/routes/v1/proxy.py
@@ -5,51 +5,15 @@ from flask import jsonify, request
 
 from llm_server import opts
 from . import bp
+from .generate_stats import generate_stats
 from .. import stats
+from ..cache import cache
 from ..queue import priority_queue
 from ..stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers
 from ...llm.info import get_running_model
 
 
 @bp.route('/stats', methods=['GET'])
-# @cache.cached(timeout=5, query_string=True)
+@cache.cached(timeout=5, query_string=True)
 def get_stats():
-    model_list, error = get_running_model()  # will return False when the fetch fails
-    if isinstance(model_list, bool):
-        online = False
-    else:
-        online = True
-
-    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
-    # if len(t) == 0:
-    #     estimated_wait = 0
-    # else:
-    #     waits = [elapsed for end, elapsed in t]
-    #     estimated_wait = int(sum(waits) / len(waits))
-
-    average_generation_time = int(calculate_avg_gen_time())
-    proompters_in_queue = len(priority_queue) + get_active_gen_workers()
-
-    return jsonify({
-        'stats': {
-            'prompts_in_queue': proompters_in_queue,
-            'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
-            'total_proompts': stats.get_total_proompts() if opts.show_num_prompts else None,
-            'uptime': int((datetime.now() - stats.server_start_time).total_seconds()) if opts.show_uptime else None,
-            'average_generation_elapsed_sec': average_generation_time,
-        },
-        'online': online,
-        'mode': opts.mode,
-        'model': model_list,
-        'endpoints': {
-            'blocking': f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}',
-        },
-        'estimated_wait_sec': int(average_generation_time * proompters_in_queue),
-        'timestamp': int(time.time()),
-        'openaiKeys': '∞',
-        'anthropicKeys': '∞',
-        'config': {
-            'gatekeeper': 'none' if opts.auth_required is False else 'token',
-            'context_size': opts.context_size,
-        }
-    }), 200
+    return jsonify(generate_stats()), 200
diff --git a/llm_server/threads.py b/llm_server/threads.py
new file mode 100644
index 0000000..bdbcee2
--- /dev/null
+++ b/llm_server/threads.py
@@ -0,0 +1,32 @@
+import time
+from threading import Thread
+
+import requests
+
+from llm_server import opts
+from llm_server.routes.cache import redis
+
+
+class BackendHealthCheck(Thread):
+    backend_online = False
+
+    def __init__(self):
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        while True:
+            if opts.mode == 'oobabooga':
+                try:
+                    r = requests.get(f'{opts.backend_url}/api/v1/model', timeout=3, verify=opts.verify_ssl)
+                    opts.running_model = r.json()['result']
+                    redis.set('backend_online', 1)
+                except Exception as e:
+                    redis.set('backend_online', 0)
+                    # TODO: handle error
+                    print(e)
+            elif opts.mode == 'hf-textgen':
+                pass
+            else:
+                raise Exception
+            time.sleep(1)
diff --git a/server.py b/server.py
index 8742750..93d4afa 100644
--- a/server.py
+++ b/server.py
@@ -1,13 +1,14 @@
+import json
 import os
 import sys
 from pathlib import Path
 from threading import Thread
 
-import config
-from flask import Flask, jsonify
+from flask import Flask, jsonify, render_template, request
 
+import config
 from llm_server import opts
-from llm_server.config import ConfigLoader, config_default_vars, config_required_vars
+from llm_server.config import ConfigLoader, config_default_vars, config_required_vars, mode_ui_names
 from llm_server.database import get_number_of_rows, init_db
 from llm_server.helpers import resolve_path
 from llm_server.routes.cache import cache, redis
@@ -15,6 +16,8 @@ from llm_server.routes.helpers.http import cache_control
 from llm_server.routes.queue import start_workers
 from llm_server.routes.stats import SemaphoreCheckerThread, process_avg_gen_time
 from llm_server.routes.v1 import bp
+from llm_server.routes.v1.generate_stats import generate_stats
+from llm_server.threads import BackendHealthCheck
 
 script_path = os.path.dirname(os.path.realpath(__file__))
 
@@ -57,6 +60,7 @@ if not opts.verify_ssl:
 
 flushed_keys = redis.flush()
 print('Flushed', len(flushed_keys), 'keys from Redis.')
+redis.set('backend_online', 0)
 
 if config['load_num_prompts']:
     redis.set('proompts', get_number_of_rows('prompts'))
@@ -70,6 +74,7 @@ start_workers(opts.concurrent_gens)
 process_avg_gen_time_background_thread = Thread(target=process_avg_gen_time)
 process_avg_gen_time_background_thread.daemon = True
 process_avg_gen_time_background_thread.start()
+BackendHealthCheck().start()
 SemaphoreCheckerThread().start()
 
 app = Flask(__name__)
@@ -84,9 +89,32 @@ app.register_blueprint(bp, url_prefix='/api/v1/')
 
 
 @app.route('/')
+@app.route('/api')
+@cache.cached(timeout=5, query_string=True)
+def home():
+    if not opts.full_client_api:
+        opts.full_client_api = f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}'
+    stats = generate_stats()
+
+    if not bool(redis.get('backend_online')) or not stats['online']:
+        running_model = estimated_wait_sec = 'offline'
+    else:
+        running_model = opts.running_model
+        estimated_wait_sec = f"{stats['estimated_wait_sec']} seconds"
+
+    return render_template('home.html',
+                           llm_middleware_name=config['llm_middleware_name'],
+                           current_model=running_model,
+                           client_api=opts.full_client_api,
+                           estimated_wait=estimated_wait_sec,
+                           mode_name=mode_ui_names[opts.mode],
+                           context_size=opts.context_size,
+                           stats_json=json.dumps(stats, indent=4, ensure_ascii=False)
+                           )
+
+
 @app.route('/<first>')
 @app.route('/<first>/<rest>')
-@cache_control(-1)
 def fallback(first=None, rest=None):
     return jsonify({
         'error': 404,
diff --git a/templates/home.html b/templates/home.html
new file mode 100644
index 0000000..66e787d
--- /dev/null
+++ b/templates/home.html
@@ -0,0 +1,72 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>{{ llm_middleware_name }}</title>
+</head>
+
+<body>
+<h1>{{ llm_middleware_name }}</h1>
+
+<p><strong>Current Model:</strong> {{ current_model }}</p>
+
+<p><strong>Client API URL:</strong> {{ client_api }}</p>
+
+<p><strong>Estimated Wait Time:</strong> {{ estimated_wait }}</p>
+
+<br>
+
+<div id="instructions">
+    <strong>Instructions:</strong>
+    <ol>
+        <li>Set your API type to {{ mode_name }}</li>
+        <li>Enter {{ client_api }} in the Blocking API url textbox.</li>
+        <li>Click Connect to test the connection.</li>
+        <li>Open your preset config and set Context Size to {{ context_size }}.</li>
+        <li>Follow this guide to get set up: <a href="https://rentry.org/freellamas">rentry.org/freellamas</a></li>
+    </ol>
+</div>
+
+<br>
+
+<pre>{{ stats_json }}</pre>
+
+</body>
+</html>