diff --git a/README.md b/README.md
index 429e246..4e827ca 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,8 @@ To set up token auth, add rows to the `token_auth` table in the SQLite database.
 
 ### Use
 
+Flask may give unusual errors when running `python server.py`. I think this is coming from Flask-Socket. Running with Gunicorn seems to fix the issue: `gunicorn -b :5000 --worker-class gevent server:app`
+
 ### To Do
 
 
diff --git a/llm_server/cluster/model_choices.py b/llm_server/cluster/model_choices.py
index f8383fe..4b02b97 100644
--- a/llm_server/cluster/model_choices.py
+++ b/llm_server/cluster/model_choices.py
@@ -73,7 +73,7 @@ def get_model_choices(regen: bool = False):
     default_estimated_wait_sec = calculate_wait_time(default_average_generation_elapsed_sec, default_proompters_in_queue, default_backend_info['concurrent_gens'], default_active_gen_workers)
 
     default_backend_dict = {
-        'client_api': f'https://{base_client_api}/v1',
+        'client_api': f'https://{base_client_api}',
         'ws_client_api': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
         'openai_client_api': f'https://{base_client_api}/openai' if opts.enable_openi_compatible_backend else 'disabled',
         'estimated_wait': default_estimated_wait_sec,
diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index c0a0927..d6328d1 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -18,12 +18,13 @@ from ...sock import sock
 
 
 # Stacking the @sock.route() creates a TypeError error on the /v1/stream endpoint.
 # We solve this by splitting the routes
-@bp.route('/stream')
-def stream():
+@bp.route('/v1/stream')
+@bp.route('/<model_name>/v1/stream')
+def stream(model_name=None):
     return 'This is a websocket endpoint.', 400
 
 
-@sock.route('/stream', bp=bp)
+@sock.route('/v1/stream', bp=bp)
 def stream_without_model(ws):
     do_stream(ws, model_name=None)
diff --git a/requirements.txt b/requirements.txt
index df16bea..28e818f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 flask~=2.3.3
 pyyaml~=6.0.1
-flask_caching
+Flask-Caching==2.0.2
 requests~=2.31.0
 tiktoken~=0.5.0
 gevent~=23.9.0.post1
diff --git a/server.py b/server.py
index 0eba490..6c06303 100644
--- a/server.py
+++ b/server.py
@@ -68,9 +68,9 @@ from llm_server.llm import redis
 from llm_server.routes.v1.generate_stats import generate_stats
 
 app = Flask(__name__)
-init_socketio(app)
 app.register_blueprint(bp, url_prefix='/api/')
 app.register_blueprint(openai_bp, url_prefix='/api/openai/v1/')
+init_socketio(app)
 flask_cache.init_app(app)
 flask_cache.clear()
 
@@ -131,7 +131,7 @@ def home():
                            default_active_gen_workers=default_backend_info['processing'],
                            default_proompters_in_queue=default_backend_info['queued'],
                            current_model=opts.manual_model_name if opts.manual_model_name else None,  # else running_model,
-                           client_api=f'https://{base_client_api}/v1',
+                           client_api=f'https://{base_client_api}',
                            ws_client_api=f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else 'disabled',
                            default_estimated_wait=default_estimated_wait_sec,
                            mode_name=mode_ui_names[opts.mode][0],
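Below is a minimal sketch of the split-route workaround applied in `llm_server/routes/v1/generate_stream.py` above. It assumes the `flask-sock` package; the standalone `app`, `sock`, and echo loop are illustrative stand-ins, since the real project registers these routes on a blueprint through its own `sock` module and streams tokens via `do_stream()`. The idea is that a plain HTTP route answers non-websocket requests with a 400, while the websocket handler is registered separately instead of stacking `@sock.route()` on the same function.

```python
from flask import Flask
from flask_sock import Sock  # assumption: flask-sock provides the websocket routing

app = Flask(__name__)
sock = Sock(app)


# Plain HTTP requests to the stream path get a clear 400 instead of a TypeError.
@app.route('/api/v1/stream')
def stream_http():
    return 'This is a websocket endpoint.', 400


# The websocket handler lives on the same path but as a separate view function;
# Werkzeug routes upgrade requests here and ordinary GETs to the view above.
@sock.route('/api/v1/stream')
def stream_ws(ws):
    # Echo loop as a stand-in for the project's do_stream() generation logic.
    while True:
        ws.send(ws.receive())
```

For serving, this pairs with the README note above: running under Gunicorn with the gevent worker (`gunicorn -b :5000 --worker-class gevent server:app`) rather than the Flask dev server.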