Merge cluster to master #3

Merged
cyberes merged 163 commits from cluster into master on 2023-10-27 19:19:22 -06:00
5 changed files with 10 additions and 7 deletions
Showing only changes of commit d203973e80


@@ -43,6 +43,8 @@ To set up token auth, add rows to the `token_auth` table in the SQLite database.
 ### Use
+Flask may give unusual errors when running `python server.py`. I think this is coming from Flask-Socket. Running with Gunicorn seems to fix the issue: `gunicorn -b :5000 --worker-class gevent server:app`
+
 ### To Do
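For reference, a minimal sketch of what `server:app` in that Gunicorn command points at; this is an illustration, not the repo's actual server.py:

# server.py -- illustrative sketch only. "server:app" in the Gunicorn
# command means: import the module `server` and use its `app` attribute
# as the WSGI application.
from flask import Flask

app = Flask(__name__)

if __name__ == '__main__':
    # `python server.py` launches Flask's built-in dev server, which is
    # where the errors mentioned above reportedly appear; Gunicorn's
    # gevent worker avoids them.
    app.run(port=5000)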


@@ -73,7 +73,7 @@ def get_model_choices(regen: bool = False):
     default_estimated_wait_sec = calculate_wait_time(default_average_generation_elapsed_sec, default_proompters_in_queue, default_backend_info['concurrent_gens'], default_active_gen_workers)
     default_backend_dict = {
-        'client_api': f'https://{base_client_api}/v1',
+        'client_api': f'https://{base_client_api}',
         'ws_client_api': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
         'openai_client_api': f'https://{base_client_api}/openai' if opts.enable_openi_compatible_backend else 'disabled',
         'estimated_wait': default_estimated_wait_sec,
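The only change in this hunk drops the `/v1` suffix from `client_api`. A quick illustration, using a hypothetical hostname (base_client_api actually comes from the proxy's config):

# Hypothetical value for illustration only.
base_client_api = 'proxy.example.com/api'
f'https://{base_client_api}/v1'  # before: https://proxy.example.com/api/v1
f'https://{base_client_api}'     # after:  https://proxy.example.com/api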


@@ -18,12 +18,13 @@ from ...sock import sock
 # Stacking the @sock.route() creates a TypeError error on the /v1/stream endpoint.
 # We solve this by splitting the routes
-@bp.route('/stream')
-def stream():
+@bp.route('/v1/stream')
+@bp.route('/<model_name>/v1/stream')
+def stream(model_name=None):
     return 'This is a websocket endpoint.', 400

-@sock.route('/stream', bp=bp)
+@sock.route('/v1/stream', bp=bp)
 def stream_without_model(ws):
     do_stream(ws, model_name=None)
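A self-contained sketch of the split-route workaround those comments describe, assuming Flask-Sock; do_stream() is a stand-in here, and the with-model handler is a hypothetical counterpart shown only to illustrate the pattern:

from flask import Blueprint
from flask_sock import Sock

bp = Blueprint('v1', __name__)
sock = Sock()

def do_stream(ws, model_name):
    # Stand-in for the repo's real streaming logic.
    ws.send(f'streaming from model: {model_name}')

# One handler per path: stacking two @sock.route() decorators on a single
# function is what triggered the TypeError mentioned above.
@sock.route('/v1/stream', bp=bp)
def stream_without_model(ws):
    do_stream(ws, model_name=None)

@sock.route('/<model_name>/v1/stream', bp=bp)  # hypothetical counterpart
def stream_with_model(ws, model_name):
    do_stream(ws, model_name=model_name)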


@@ -1,6 +1,6 @@
 flask~=2.3.3
 pyyaml~=6.0.1
-flask_caching
+Flask-Caching==2.0.2
 requests~=2.31.0
 tiktoken~=0.5.0
 gevent~=23.9.0.post1
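This pins the previously floating flask_caching entry to an exact version. For reference, a minimal sketch of how Flask-Caching is typically wired up, matching the flask_cache.init_app(app) and flask_cache.clear() calls in server.py below; the SimpleCache backend is an assumption for this sketch:

from flask import Flask
from flask_caching import Cache

app = Flask(__name__)
# CACHE_TYPE is an assumption; the repo may configure a different backend.
flask_cache = Cache(app, config={'CACHE_TYPE': 'SimpleCache'})

with app.app_context():
    flask_cache.clear()  # mirrors the flask_cache.clear() call in server.py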


@@ -68,9 +68,9 @@ from llm_server.llm import redis
 from llm_server.routes.v1.generate_stats import generate_stats

 app = Flask(__name__)
-init_socketio(app)
 app.register_blueprint(bp, url_prefix='/api/')
 app.register_blueprint(openai_bp, url_prefix='/api/openai/v1/')
+init_socketio(app)
 flask_cache.init_app(app)
 flask_cache.clear()
@@ -131,7 +131,7 @@ def home():
         default_active_gen_workers=default_backend_info['processing'],
         default_proompters_in_queue=default_backend_info['queued'],
         current_model=opts.manual_model_name if opts.manual_model_name else None,  # else running_model,
-        client_api=f'https://{base_client_api}/v1',
+        client_api=f'https://{base_client_api}',
         ws_client_api=f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else 'disabled',
         default_estimated_wait=default_estimated_wait_sec,
         mode_name=mode_ui_names[opts.mode][0],