diff --git a/README.md b/README.md
index 429e246..4e827ca 100644
--- a/README.md
+++ b/README.md
@@ -43,6 +43,8 @@ To set up token auth, add rows to the `token_auth` table in the SQLite database.
 
 ### Use
 
+Flask may give unusual errors when running `python server.py`. I think this is coming from Flask-Socket. Running with Gunicorn seems to fix the issue: `gunicorn -b :5000 --worker-class gevent server:app`
+
 ### To Do
 
 
diff --git a/llm_server/cluster/model_choices.py b/llm_server/cluster/model_choices.py
index f8383fe..4b02b97 100644
--- a/llm_server/cluster/model_choices.py
+++ b/llm_server/cluster/model_choices.py
@@ -73,7 +73,7 @@ def get_model_choices(regen: bool = False):
     default_estimated_wait_sec = calculate_wait_time(default_average_generation_elapsed_sec, default_proompters_in_queue, default_backend_info['concurrent_gens'], default_active_gen_workers)
 
     default_backend_dict = {
-        'client_api': f'https://{base_client_api}/v1',
+        'client_api': f'https://{base_client_api}',
         'ws_client_api': f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else None,
         'openai_client_api': f'https://{base_client_api}/openai' if opts.enable_openi_compatible_backend else 'disabled',
         'estimated_wait': default_estimated_wait_sec,
diff --git a/llm_server/routes/v1/generate_stream.py b/llm_server/routes/v1/generate_stream.py
index c0a0927..d6328d1 100644
--- a/llm_server/routes/v1/generate_stream.py
+++ b/llm_server/routes/v1/generate_stream.py
@@ -18,12 +18,13 @@ from ...sock import sock
 
 
 # Stacking the @sock.route() creates a TypeError error on the /v1/stream endpoint.
 # We solve this by splitting the routes
-@bp.route('/stream')
-def stream():
+@bp.route('/v1/stream')
+@bp.route('/<model_name>/v1/stream')
+def stream(model_name=None):
     return 'This is a websocket endpoint.', 400
 
 
-@sock.route('/stream', bp=bp)
+@sock.route('/v1/stream', bp=bp)
 def stream_without_model(ws):
     do_stream(ws, model_name=None)
diff --git a/requirements.txt b/requirements.txt
index df16bea..28e818f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 flask~=2.3.3
 pyyaml~=6.0.1
-flask_caching
+Flask-Caching==2.0.2
 requests~=2.31.0
 tiktoken~=0.5.0
 gevent~=23.9.0.post1
diff --git a/server.py b/server.py
index 0eba490..6c06303 100644
--- a/server.py
+++ b/server.py
@@ -68,9 +68,9 @@ from llm_server.llm import redis
 from llm_server.routes.v1.generate_stats import generate_stats
 
 app = Flask(__name__)
-init_socketio(app)
 app.register_blueprint(bp, url_prefix='/api/')
 app.register_blueprint(openai_bp, url_prefix='/api/openai/v1/')
+init_socketio(app)
 flask_cache.init_app(app)
 flask_cache.clear()
 
@@ -131,7 +131,7 @@ def home():
                            default_active_gen_workers=default_backend_info['processing'],
                            default_proompters_in_queue=default_backend_info['queued'],
                            current_model=opts.manual_model_name if opts.manual_model_name else None,  # else running_model,
-                           client_api=f'https://{base_client_api}/v1',
+                           client_api=f'https://{base_client_api}',
                            ws_client_api=f'wss://{base_client_api}/v1/stream' if opts.enable_streaming else 'disabled',
                            default_estimated_wait=default_estimated_wait_sec,
                            mode_name=mode_ui_names[opts.mode][0],
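Below is a minimal sketch of the split-route workaround applied in `llm_server/routes/v1/generate_stream.py` above. It assumes the `flask-sock` package; the standalone `app`, `sock`, and echo loop are illustrative stand-ins, since the real project registers these routes on a blueprint through its own `sock` module and streams tokens via `do_stream()`. The idea is that a plain HTTP route answers non-websocket requests with a 400, while the websocket handler is registered separately instead of stacking `@sock.route()` on the same function.

```python
from flask import Flask
from flask_sock import Sock  # assumption: flask-sock provides the websocket routing

app = Flask(__name__)
sock = Sock(app)


# Plain HTTP requests to the stream path get a clear 400 instead of a TypeError.
@app.route('/api/v1/stream')
def stream_http():
    return 'This is a websocket endpoint.', 400


# The websocket handler lives on the same path but as a separate view function;
# Werkzeug routes upgrade requests here and ordinary GETs to the view above.
@sock.route('/api/v1/stream')
def stream_ws(ws):
    # Echo loop as a stand-in for the project's do_stream() generation logic.
    while True:
        ws.send(ws.receive())
```

For serving, this pairs with the README note above: running under Gunicorn with the gevent worker (`gunicorn -b :5000 --worker-class gevent server:app`) rather than the Flask dev server.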