add home template

parent dca2dc570f
commit f3fe514c11
@@ -10,6 +10,8 @@ token_limit: 7777
 backend_url: https://10.0.0.86:8083

+llm_middleware_name: proxy.chub-archive.evulid.cc
+

 ## STATS ##

 # Display the total_proompts item on the stats screen.
@@ -10,7 +10,12 @@ config_default_vars = {
     'show_num_prompts': True,
     'show_uptime': True,
 }
-config_required_vars = ['token_limit', 'concurrent_gens', 'mode']
+config_required_vars = ['token_limit', 'concurrent_gens', 'mode', 'llm_middleware_name']
+
+mode_ui_names = {
+    'oobabooga': 'Text Gen WebUI (ooba)',
+    'hf-textgen': 'UNDEFINED',
+}


 class ConfigLoader:
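Note: ConfigLoader itself is not touched by this commit, so the enforcement of the new 'llm_middleware_name' entry is not visible here. A minimal sketch of what a loader that honours config_default_vars and config_required_vars might look like (load_config below is hypothetical, not code from this repository):

    # Hypothetical sketch, assuming it sits next to config_default_vars / config_required_vars.
    import yaml

    def load_config(path):
        with open(path) as f:
            config = yaml.safe_load(f) or {}
        for key, default in config_default_vars.items():
            config.setdefault(key, default)  # optional keys fall back to their defaults
        missing = [k for k in config_required_vars if k not in config]
        if missing:
            raise ValueError(f'missing required config variables: {missing}')
        return config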
@@ -1,4 +1,4 @@
-# Global settings that never change after startup/init
+# Read-only global variables

 running_model = 'none'
 concurrent_gens = 3
@@ -9,6 +9,7 @@ database_path = './proxy-server.db'
 auth_required = False
 log_prompts = False
 frontend_api_client = ''
+full_client_api = None
 http_host = None
 verify_ssl = True
 show_num_prompts = True
@@ -12,6 +12,8 @@ bp = Blueprint('v1', __name__)
 def before_request():
     if not opts.http_host:
         opts.http_host = request.headers.get("Host")
+    if not opts.full_client_api:
+        opts.full_client_api = f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}'
     if request.endpoint != 'v1.get_stats':
         response = require_api_key()
         if response is not None:
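Worked example of the new lines above: if the incoming request carries Host: proxy.chub-archive.evulid.cc and opts.frontend_api_client is set to '/api' (an assumed value, not shown in this diff), opts.full_client_api becomes https://proxy.chub-archive.evulid.cc/api and is reused unchanged for every later request.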
@@ -0,0 +1,48 @@
+from datetime import datetime
+import time
+
+from llm_server import opts
+from llm_server.llm.info import get_running_model
+from llm_server.routes.queue import priority_queue
+from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
+
+
+def generate_stats():
+    model_list, error = get_running_model()  # will return False when the fetch fails
+    if isinstance(model_list, bool):
+        online = False
+    else:
+        online = True
+
+    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
+    # if len(t) == 0:
+    #     estimated_wait = 0
+    # else:
+    #     waits = [elapsed for end, elapsed in t]
+    #     estimated_wait = int(sum(waits) / len(waits))
+
+    average_generation_time = int(calculate_avg_gen_time())
+    proompters_in_queue = len(priority_queue) + get_active_gen_workers()
+    return {
+        'stats': {
+            'prompts_in_queue': proompters_in_queue,
+            'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
+            'total_proompts': get_total_proompts() if opts.show_num_prompts else None,
+            'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
+            'average_generation_elapsed_sec': average_generation_time,
+        },
+        'online': online,
+        'mode': opts.mode,
+        'model': model_list,
+        'endpoints': {
+            'blocking': opts.full_client_api,
+        },
+        'estimated_wait_sec': int(average_generation_time * proompters_in_queue),
+        'timestamp': int(time.time()),
+        'openaiKeys': '∞',
+        'anthropicKeys': '∞',
+        'config': {
+            'gatekeeper': 'none' if opts.auth_required is False else 'token',
+            'context_size': opts.context_size,
+        }
+    }
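For reference, generate_stats() returns a plain dict shaped roughly like the following (all values are illustrative placeholders, not real measurements):

    {
        'stats': {
            'prompts_in_queue': 2,
            'proompters_1_min': 5,
            'total_proompts': 1234,
            'uptime': 3600,
            'average_generation_elapsed_sec': 12,
        },
        'online': True,
        'mode': 'oobabooga',
        'model': 'example-13b',  # whatever get_running_model() reported
        'endpoints': {'blocking': 'https://proxy.chub-archive.evulid.cc/api'},
        'estimated_wait_sec': 24,
        'timestamp': 1693000000,
        'openaiKeys': '∞',
        'anthropicKeys': '∞',
        'config': {'gatekeeper': 'none', 'context_size': 4096},
    }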
@@ -5,51 +5,15 @@ from flask import jsonify, request

 from llm_server import opts
 from . import bp
+from .generate_stats import generate_stats
 from .. import stats
+from ..cache import cache
 from ..queue import priority_queue
 from ..stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers
 from ...llm.info import get_running_model


 @bp.route('/stats', methods=['GET'])
-# @cache.cached(timeout=5, query_string=True)
+@cache.cached(timeout=5, query_string=True)
 def get_stats():
-    model_list, error = get_running_model()  # will return False when the fetch fails
-    if isinstance(model_list, bool):
-        online = False
-    else:
-        online = True
-
-    # t = elapsed_times.copy()  # copy since we do multiple operations and don't want it to change
-    # if len(t) == 0:
-    #     estimated_wait = 0
-    # else:
-    #     waits = [elapsed for end, elapsed in t]
-    #     estimated_wait = int(sum(waits) / len(waits))
-
-    average_generation_time = int(calculate_avg_gen_time())
-    proompters_in_queue = len(priority_queue) + get_active_gen_workers()
-
-    return jsonify({
-        'stats': {
-            'prompts_in_queue': proompters_in_queue,
-            'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
-            'total_proompts': stats.get_total_proompts() if opts.show_num_prompts else None,
-            'uptime': int((datetime.now() - stats.server_start_time).total_seconds()) if opts.show_uptime else None,
-            'average_generation_elapsed_sec': average_generation_time,
-        },
-        'online': online,
-        'mode': opts.mode,
-        'model': model_list,
-        'endpoints': {
-            'blocking': f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}',
-        },
-        'estimated_wait_sec': int(average_generation_time * proompters_in_queue),
-        'timestamp': int(time.time()),
-        'openaiKeys': '∞',
-        'anthropicKeys': '∞',
-        'config': {
-            'gatekeeper': 'none' if opts.auth_required is False else 'token',
-            'context_size': opts.context_size,
-        }
-    }), 200
+    return jsonify(generate_stats()), 200
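Since the v1 blueprint is registered under /api/v1/ in server.py (below), the slimmed-down endpoint is reachable at /api/v1/stats and is now cached for 5 seconds. An illustrative client call (hostname is a placeholder):

    import requests

    # Poll the cached stats endpoint; the response body is the generate_stats() dict shown above.
    r = requests.get('https://proxy.chub-archive.evulid.cc/api/v1/stats', timeout=5)
    print(r.json()['stats']['prompts_in_queue'])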
@@ -0,0 +1,32 @@
+import time
+from threading import Thread
+
+import requests
+
+from llm_server import opts
+from llm_server.routes.cache import redis
+
+
+class BackendHealthCheck(Thread):
+    backend_online = False
+
+    def __init__(self):
+        Thread.__init__(self)
+        self.daemon = True
+
+    def run(self):
+        while True:
+            if opts.mode == 'oobabooga':
+                try:
+                    r = requests.get(f'{opts.backend_url}/api/v1/model', timeout=3, verify=opts.verify_ssl)
+                    opts.running_model = r.json()['result']
+                    redis.set('backend_online', 1)
+                except Exception as e:
+                    redis.set('backend_online', 0)
+                    # TODO: handle error
+                    print(e)
+            elif opts.mode == 'hf-textgen':
+                pass
+            else:
+                raise Exception
+            time.sleep(1)
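BackendHealthCheck publishes the backend state through the shared redis wrapper rather than its backend_online class attribute, so any part of the server can read it. A hedged sketch of a consumer (the exact return type of this project's redis wrapper is not shown in the diff, hence the defensive comparison):

    from llm_server.routes.cache import redis

    def backend_is_online():
        # The health-check thread stores 1 when the oobabooga backend answered, 0 otherwise.
        return redis.get('backend_online') in (1, '1', b'1')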
server.py (36 changed lines)
@@ -1,13 +1,14 @@
+import json
 import os
 import sys
 from pathlib import Path
 from threading import Thread

-import config
-from flask import Flask, jsonify
+from flask import Flask, jsonify, render_template, request

+import config
 from llm_server import opts
-from llm_server.config import ConfigLoader, config_default_vars, config_required_vars
+from llm_server.config import ConfigLoader, config_default_vars, config_required_vars, mode_ui_names
 from llm_server.database import get_number_of_rows, init_db
 from llm_server.helpers import resolve_path
 from llm_server.routes.cache import cache, redis

@@ -15,6 +16,8 @@ from llm_server.routes.helpers.http import cache_control
 from llm_server.routes.queue import start_workers
 from llm_server.routes.stats import SemaphoreCheckerThread, process_avg_gen_time
 from llm_server.routes.v1 import bp
+from llm_server.routes.v1.generate_stats import generate_stats
+from llm_server.threads import BackendHealthCheck

 script_path = os.path.dirname(os.path.realpath(__file__))
@@ -57,6 +60,7 @@ if not opts.verify_ssl:

 flushed_keys = redis.flush()
 print('Flushed', len(flushed_keys), 'keys from Redis.')
+redis.set('backend_online', 0)

 if config['load_num_prompts']:
     redis.set('proompts', get_number_of_rows('prompts'))
@@ -70,6 +74,7 @@ start_workers(opts.concurrent_gens)
 process_avg_gen_time_background_thread = Thread(target=process_avg_gen_time)
 process_avg_gen_time_background_thread.daemon = True
 process_avg_gen_time_background_thread.start()
+BackendHealthCheck().start()
 SemaphoreCheckerThread().start()

 app = Flask(__name__)
@@ -84,9 +89,32 @@ app.register_blueprint(bp, url_prefix='/api/v1/')


 @app.route('/')
+@app.route('/api')
+@cache.cached(timeout=5, query_string=True)
+def home():
+    if not opts.full_client_api:
+        opts.full_client_api = f'https://{request.headers.get("Host")}/{opts.frontend_api_client.strip("/")}'
+    stats = generate_stats()
+
+    if not bool(redis.get('backend_online')) or not stats['online']:
+        running_model = estimated_wait_sec = 'offline'
+    else:
+        running_model = opts.running_model
+        estimated_wait_sec = f"{stats['estimated_wait_sec']} seconds"
+
+    return render_template('home.html',
+                           llm_middleware_name=config['llm_middleware_name'],
+                           current_model=running_model,
+                           client_api=opts.full_client_api,
+                           estimated_wait=estimated_wait_sec,
+                           mode_name=mode_ui_names[opts.mode],
+                           context_size=opts.context_size,
+                           stats_json=json.dumps(stats, indent=4, ensure_ascii=False)
+                           )
+
+
 @app.route('/<first>')
 @app.route('/<first>/<path:rest>')
-@cache_control(-1)
 def fallback(first=None, rest=None):
     return jsonify({
         'error': 404,
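The new home() view is registered on both / and /api, cached for 5 seconds, and feeds the generate_stats() output plus the config-driven names into the new home.html template. A rough smoke test using Flask's built-in test client (illustrative only, not part of this commit):

    with app.test_client() as client:
        resp = client.get('/', headers={'Host': 'proxy.chub-archive.evulid.cc'})
        assert resp.status_code == 200
        assert b'Current Model' in resp.data  # heading rendered from the template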
@@ -0,0 +1,72 @@
+<!DOCTYPE html>
+<html>
+
+<head>
+    <meta content="width=device-width, initial-scale=1" name="viewport"/>
+    <script>
+        var _paq = window._paq = window._paq || [];
+        _paq.push(['trackPageView']);
+        _paq.push(['enableLinkTracking']);
+        (function () {
+            var u = "https://mato.evulid.cc/";
+            _paq.push(['setTrackerUrl', u + 'matomo.php']);
+            _paq.push(['setSiteId', '10']);
+            var d = document,
+                g = d.createElement('script'),
+                s = d.getElementsByTagName('script')[0];
+            g.async = true;
+            g.src = u + 'matomo.js';
+            s.parentNode.insertBefore(g, s);
+        })();
+    </script>
+    <style>
+        .container {
+            padding: 1em 3em;
+        }
+
+        #json {
+            background-color: #ffb6c16e;
+            padding: 1em;
+            display: inline-block;
+        }
+
+        @media only screen and (max-width: 600px) {
+            .container {
+                padding: 1em;
+            }
+
+            h1 {
+                font-size: 1.5em;
+            }
+        }
+    </style>
+</head>
+
+<body>
+<div class="container">
+    <h1 style="text-align: center;margin-top: 0;">{{ llm_middleware_name }}</h1>
+
+    <p><strong>Current Model:</strong> <span id="model">{{ current_model }}</span></p>
+    <p><strong>Client API URL:</strong> {{ client_api }}</p>
+    <p><strong>Estimated Wait Time:</strong> <span id="estimatedWait">{{ estimated_wait }}</span></p>
+
+    <br>
+
+    <div id="oobabooga">
+        <strong>Instructions:</strong>
+        <ol>
+            <li>Set your API type to <kbd>{{ mode_name }}</kbd></li>
+            <li>Enter <kbd>{{ client_api }}</kbd> in the <kbd>Blocking API url</kbd> textbox.</li>
+            <li>Click <kbd>Connect</kbd> to test the connection.</li>
+            <li>Open your preset config and set <kbd>Context Size</kbd> to {{ context_size }}.</li>
+            <li>Follow this guide to get set up: <a href="https://rentry.org/freellamas" target="_blank">rentry.org/freellamas</a></li>
+        </ol>
+    </div>
+
+    <br><br>
+
+    <pre id="json">{{ stats_json }}</pre>
+</div>
+</body>
+
+</html>