update readme

This commit is contained in:
parent 1cb6389a8a
commit 21174750ea
@@ -49,4 +49,10 @@ To set up token auth, add rows to the `token_auth` table in the SQLite database.
 `expire`: UNIX timestamp of when this token expires and is no longer valid.

 `disabled`: mark the token as disabled.
+
+### To Do
+
+- Implement streaming
+- Add `huggingface/text-generation-inference`
+- Convince Oobabooga to implement concurrent generation
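For illustration, a minimal sketch of adding a token row using Python's built-in `sqlite3` module. The `token_auth` table and the `expire` and `disabled` columns come from the README text above; the `token` column name and the database filename are assumptions and may differ from the actual schema.

```python
import sqlite3
import time

# Hypothetical database path and token column name -- check the real schema first.
conn = sqlite3.connect('llm-server.db')
conn.execute(
    'INSERT INTO token_auth (token, expire, disabled) VALUES (?, ?, ?)',
    (
        'my-secret-token',         # assumed name of the column holding the token itself
        int(time.time()) + 86400,  # expire: UNIX timestamp, here 24 hours from now
        0,                         # disabled: 0 = active, 1 = disabled
    ),
)
conn.commit()
conn.close()
```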
@@ -7,8 +7,9 @@ backend_url: http://x.x.x.x:5000
 mode: oobabooga

 # How many concurrent generation requests will be processed at the same time.
-# Oobabooga only supports one.
-concurrent_gens: 3
+# Oobabooga only supports one. If you're using Oobabooga, you MUST set this to 1
+# or else your estimated wait time will be incorrect.
+concurrent_gens: 1

 # The configured token limit of your backend.
 # This number is shown to clients and on the home page. (may be important later)
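The reason for the MUST: the server divides by `concurrent_gens` when estimating wait time (see the `estimated_wait_sec` change in the last hunk below). A quick sketch of that arithmetic, with illustrative numbers:

```python
average_generation_time = 10  # seconds per generation (illustrative value)
proompters_in_queue = 6

# Old default of 3 against a backend that really only runs one request
# at a time: the estimate comes out 3x too optimistic.
int((average_generation_time * proompters_in_queue) / 3)  # -> 20 seconds (wrong)

# With concurrent_gens matching Oobabooga's actual capacity of 1:
int((average_generation_time * proompters_in_queue) / 1)  # -> 60 seconds (right)
```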
@@ -27,6 +28,7 @@ verify_ssl: false
 # Reject all requests if they aren't authenticated with a token.
 auth_required: false

+# JS tracking code to add to the home page.
 #analytics_tracking_code: |
 #  alert("hello");
@@ -29,6 +29,8 @@ generation_elapsed_lock = Lock()


 def calculate_avg_gen_time():
+    # TODO: calculate the average from the database. Have this be set by an option in the config
+
     # Get the average generation time from Redis
     average_generation_time = redis.get('average_generation_time')
     if average_generation_time is None:
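For context, a sketch of how a Redis-backed average like this can work. The `average_generation_time` key and the `None` check are taken from the diff; the writer side, the fallback value, and the exponential moving average are assumptions about one plausible implementation, not this repo's actual code.

```python
import redis as redis_lib

redis = redis_lib.Redis()  # matches the bare `redis` client used in the diff

def record_gen_time(elapsed: float, alpha: float = 0.2) -> None:
    # Hypothetical writer: fold each new generation time into an
    # exponential moving average stored under the same Redis key.
    prev = redis.get('average_generation_time')
    avg = elapsed if prev is None else alpha * elapsed + (1 - alpha) * float(prev)
    redis.set('average_generation_time', avg)

def calculate_avg_gen_time() -> float:
    # Reader: same shape as the function shown in the diff.
    average_generation_time = redis.get('average_generation_time')
    if average_generation_time is None:
        return 0.0  # assumed fallback when nothing has been recorded yet
    return float(average_generation_time)
```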
@@ -1,5 +1,5 @@
-from datetime import datetime
 import time
+from datetime import datetime

 from llm_server import opts
 from llm_server.llm.info import get_running_model
@@ -23,9 +23,11 @@ def generate_stats():
     average_generation_time = int(calculate_avg_gen_time())
     proompters_in_queue = len(priority_queue) + get_active_gen_workers()

+    # TODO: https://stackoverflow.com/questions/22721579/sorting-a-nested-ordereddict-by-key-recursively
+
     return {
         'stats': {
-            'prompts_in_queue': proompters_in_queue,
+            'proompts_in_queue': proompters_in_queue,
             'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
             'total_proompts': get_total_proompts() if opts.show_num_prompts else None,
             'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
@@ -37,12 +39,13 @@ def generate_stats():
         'endpoints': {
             'blocking': opts.full_client_api,
         },
-        'estimated_wait_sec': int(average_generation_time * proompters_in_queue),
+        'estimated_wait_sec': int((average_generation_time * proompters_in_queue) / opts.concurrent_gens),
         'timestamp': int(time.time()),
         'openaiKeys': '∞',
         'anthropicKeys': '∞',
         'config': {
             'gatekeeper': 'none' if opts.auth_required is False else 'token',
             'context_size': opts.context_size,
+            'queue_size': opts.concurrent_gens,
         }
     }
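Putting the pieces together, a populated stats payload after this commit might look roughly like the following. The keys are the ones visible in this diff; every value is invented for illustration, and the `blocking` URL shape in particular is an assumption.

```python
# Illustrative generate_stats() output after this commit (values made up):
example_stats = {
    'stats': {
        'proompts_in_queue': 6,
        'proompters_1_min': 4,
        'total_proompts': 1024,
        'uptime': 3600,
    },
    'endpoints': {
        'blocking': 'http://x.x.x.x:5000/api/v1/generate',  # assumed URL shape
    },
    'estimated_wait_sec': 60,  # (10 s avg * 6 in queue) / concurrent_gens=1
    'timestamp': 1693526400,
    'openaiKeys': '∞',
    'anthropicKeys': '∞',
    'config': {
        'gatekeeper': 'none',
        'context_size': 2048,
        'queue_size': 1,
    },
}
```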