diff --git a/README.md b/README.md
index 77b73cb..485cb91 100644
--- a/README.md
+++ b/README.md
@@ -49,4 +49,10 @@ To set up token auth, add rows to the `token_auth` table in the SQLite database.
 
 `expire`: UNIX timestamp of when this token expires and is not longer valid.
 
-`disabled`: mark the token as disabled.
\ No newline at end of file
+`disabled`: mark the token as disabled.
+
+### To Do
+
+- Implement streaming
+- Add `huggingface/text-generation-inference`
+- Convince Oobabooga to implement concurrent generation
diff --git a/config/config.yml.sample b/config/config.yml.sample
index a20682a..04efc52 100644
--- a/config/config.yml.sample
+++ b/config/config.yml.sample
@@ -7,8 +7,9 @@ backend_url: http://x.x.x.x:5000
 mode: oobabooga
 
 # How many concurrent generation requests will be processed at the same time.
-# Oobabooga only supports one.
-concurrent_gens: 3
+# Oobabooga only supports one. If you're using Oobabooga, you MUST set this to 1
+# or else your estimated wait time will be incorrect.
+concurrent_gens: 1
 
 # The configured token limit of your backend.
 # This number is shown to clients and on the home page. (may be important later)
@@ -27,6 +28,7 @@ verify_ssl: false
 # Reject all requests if they aren't authenticated with a token.
 auth_required: false
 
+# JS tracking code to add to the home page.
 #analytics_tracking_code: |
 #  alert("hello");
 
diff --git a/llm_server/routes/stats.py b/llm_server/routes/stats.py
index 806baaf..1454a5f 100644
--- a/llm_server/routes/stats.py
+++ b/llm_server/routes/stats.py
@@ -29,6 +29,8 @@ generation_elapsed_lock = Lock()
 
 
 def calculate_avg_gen_time():
+    # TODO: calculate the average from the database. Have this be set by an option in the config
+
     # Get the average generation time from Redis
     average_generation_time = redis.get('average_generation_time')
     if average_generation_time is None:
diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
index d9da9cc..8104397 100644
--- a/llm_server/routes/v1/generate_stats.py
+++ b/llm_server/routes/v1/generate_stats.py
@@ -1,5 +1,5 @@
-from datetime import datetime
 import time
+from datetime import datetime
 
 from llm_server import opts
 from llm_server.llm.info import get_running_model
@@ -23,9 +23,11 @@ def generate_stats():
     average_generation_time = int(calculate_avg_gen_time())
     proompters_in_queue = len(priority_queue) + get_active_gen_workers()
 
+
+    # TODO: https://stackoverflow.com/questions/22721579/sorting-a-nested-ordereddict-by-key-recursively
     return {
         'stats': {
-            'prompts_in_queue': proompters_in_queue,
+            'proompts_in_queue': proompters_in_queue,
             'proompters_1_min': SemaphoreCheckerThread.proompters_1_min,
             'total_proompts': get_total_proompts() if opts.show_num_prompts else None,
             'uptime': int((datetime.now() - server_start_time).total_seconds()) if opts.show_uptime else None,
@@ -37,12 +39,13 @@
         'endpoints': {
             'blocking': opts.full_client_api,
         },
-        'estimated_wait_sec': int(average_generation_time * proompters_in_queue),
+        'estimated_wait_sec': int((average_generation_time * proompters_in_queue) / opts.concurrent_gens),
         'timestamp': int(time.time()),
         'openaiKeys': '∞',
         'anthropicKeys': '∞',
         'config': {
             'gatekeeper': 'none' if opts.auth_required is False else 'token',
             'context_size': opts.context_size,
+            'queue_size': opts.concurrent_gens,
         }
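
For clarity, here is a minimal sketch (not part of the patch) of the wait-time estimate this change introduces; the standalone `estimated_wait_sec` helper below is hypothetical, since the patch computes the value inline in `generate_stats()`:

```python
def estimated_wait_sec(average_generation_time: float,
                       proompters_in_queue: int,
                       concurrent_gens: int) -> int:
    # With concurrent_gens workers draining the queue in parallel, the expected
    # wait is (queue length * average generation time) divided by the worker count.
    return int((average_generation_time * proompters_in_queue) / concurrent_gens)


# Example: 12 s average generation time and 4 queued prompts.
# With Oobabooga (concurrent_gens = 1) the estimate is 48 s;
# with 3 concurrent workers it would be 16 s.
assert estimated_wait_sec(12, 4, 1) == 48
assert estimated_wait_sec(12, 4, 3) == 16
```

This is also why the sample config now forces `concurrent_gens: 1` for Oobabooga: with a larger value the divisor would understate the real wait.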