From 6a09ffc8a4bc89e88df23282456fbf8a2b610b15 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Sat, 26 Aug 2023 00:30:59 -0600
Subject: [PATCH] log model used in request so we can pull the correct averages when we change models

---
 README.md                              |  4 +++-
 llm_server/database.py                 | 14 ++++++++++++--
 llm_server/routes/v1/generate_stats.py |  2 +-
 llm_server/threads.py                  | 20 +++++++++++---------
 4 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 814ccd6..f8ca1e4 100644
--- a/README.md
+++ b/README.md
@@ -42,10 +42,12 @@ To set up token auth, add rows to the `token_auth` table in the SQLite database.
 
 ### Use
 
-**DO NOT** lose your database. It's used for calculating the estimated wait time based on average TPS and response tokens and if you lose those stats your numbers will be inaccurate until the database fills back up again. If you change GPUs, you should probably clear the `generation_time` time column in the `prompts` table.
+**DO NOT** lose your database. It's used for calculating the estimated wait time based on average TPS and response tokens, and if you lose those stats your numbers will be inaccurate until the database fills back up again. If you change GPUs, you
+should probably clear the `generation_time` column in the `prompts` table.
 
 ### To Do
 
 - Implement streaming
 - Add `huggingface/text-generation-inference`
 - Convince Oobabooga to implement concurrent generation
+- Make sure stats work when starting from an empty database
diff --git a/llm_server/database.py b/llm_server/database.py
index 803756d..269b7ba 100644
--- a/llm_server/database.py
+++ b/llm_server/database.py
@@ -24,6 +24,7 @@ def init_db():
         response_tokens INTEGER,
         response_status INTEGER,
         generation_time FLOAT,
+        model TEXT,
         parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)),
         headers TEXT CHECK (headers IS NULL OR json_valid(headers)),
         timestamp INTEGER
@@ -59,8 +60,8 @@ def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backe
     timestamp = int(time.time())
     conn = sqlite3.connect(opts.database_path)
     c = conn.cursor()
-    c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
-              (ip, token, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, json.dumps(parameters), json.dumps(headers), timestamp))
+    c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+              (ip, token, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, opts.running_model, json.dumps(parameters), json.dumps(headers), timestamp))
     conn.commit()
     conn.close()
 
@@ -108,6 +109,15 @@ def average_column(table_name, column_name):
     return result[0]
 
 
+def average_column_for_model(table_name, column_name, model_name):
+    conn = sqlite3.connect(opts.database_path)
+    cursor = conn.cursor()
+    cursor.execute(f"SELECT AVG({column_name}) FROM {table_name} WHERE model = ?", (model_name,))
+    result = cursor.fetchone()
+    conn.close()
+    return result[0]
+
+
 def sum_column(table_name, column_name):
     conn = sqlite3.connect(opts.database_path)
     cursor = conn.cursor()
diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
index ad70a8d..1a28df7 100644
--- a/llm_server/routes/v1/generate_stats.py
+++ b/llm_server/routes/v1/generate_stats.py
@@ -34,7 +34,7 @@ def generate_stats():
     if opts.average_generation_time_mode == 'database':
         average_generation_time = int(float(redis.get('average_generation_elapsed_sec')))
         average_output_tokens = int(float(redis.get('average_output_tokens')))
-        estimated_wait_sec = int(((average_output_tokens / average_tps) * proompters_in_queue) / opts.concurrent_gens)
+        estimated_wait_sec = int(((average_output_tokens / average_tps) * proompters_in_queue) / opts.concurrent_gens) if average_tps > 0 else 0
     elif opts.average_generation_time_mode == 'minute':
         average_generation_time = int(calculate_avg_gen_time())
         estimated_wait_sec = int((average_generation_time * proompters_in_queue) / opts.concurrent_gens)
diff --git a/llm_server/threads.py b/llm_server/threads.py
index 437472d..9d1ca15 100644
--- a/llm_server/threads.py
+++ b/llm_server/threads.py
@@ -4,7 +4,7 @@ from threading import Thread
 import requests
 
 from llm_server import opts
-from llm_server.database import average_column
+from llm_server.database import average_column_for_model
 from llm_server.routes.cache import redis
 
 
@@ -24,14 +24,6 @@ class MainBackgroundThread(Thread):
 
     def run(self):
         while True:
-            average_generation_elapsed_sec = average_column('prompts', 'generation_time') if not None else 0
-            redis.set('average_generation_elapsed_sec', average_generation_elapsed_sec)
-
-            average_output_tokens = average_column('prompts', 'response_tokens') if not None else 0
-            redis.set('average_output_tokens', average_output_tokens)
-            average_tps = round(average_output_tokens / average_generation_elapsed_sec, 2)
-            redis.set('average_tps', average_tps)
-
             if opts.mode == 'oobabooga':
                 try:
                     r = requests.get(f'{opts.backend_url}/api/v1/model', timeout=3, verify=opts.verify_ssl)
@@ -45,4 +37,14 @@ class MainBackgroundThread(Thread):
                     pass
             else:
                 raise Exception
+
+            average_generation_elapsed_sec = average_column_for_model('prompts', 'generation_time', opts.running_model) or 0
+            redis.set('average_generation_elapsed_sec', average_generation_elapsed_sec)
+
+            average_output_tokens = average_column_for_model('prompts', 'response_tokens', opts.running_model) or 0
+            redis.set('average_output_tokens', average_output_tokens)
+
+            # Avoid division by zero
+            average_tps = round(average_output_tokens / average_generation_elapsed_sec, 2) if average_generation_elapsed_sec > 0 else 0
+            redis.set('average_tps', average_tps)
             time.sleep(60)
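
Note, not part of the patch: the new `model TEXT` column is only added to the CREATE TABLE statement in init_db(), so a `prompts` table created before this commit would be missing it, and both the widened INSERT and the `WHERE model = ?` filter would fail against it. A minimal one-off migration sketch, assuming the stock `prompts` table and this repo's `opts.database_path`; the helper name `add_model_column` is made up for illustration:

    import sqlite3

    from llm_server import opts


    def add_model_column():
        # Add the `model` column to a pre-existing prompts table so the
        # per-model averages introduced by this patch have something to filter on.
        conn = sqlite3.connect(opts.database_path)
        c = conn.cursor()
        existing = [row[1] for row in c.execute("PRAGMA table_info(prompts)")]  # row[1] is the column name
        if 'model' not in existing:
            c.execute("ALTER TABLE prompts ADD COLUMN model TEXT")  # old rows get model = NULL
        conn.commit()
        conn.close()

Old rows keep model = NULL, so they simply drop out of the per-model averages rather than skewing them.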
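
On the generate_stats.py change: threads.py computes average_tps as average_output_tokens / average_generation_elapsed_sec, so average_output_tokens / average_tps is effectively the average per-request generation time, and the new guard keeps a freshly started (or freshly switched) model with no history from dividing by zero. A standalone sketch of that arithmetic, with made-up example numbers rather than data from a real deployment:

    def estimated_wait_sec(average_output_tokens: float, average_tps: float,
                           proompters_in_queue: int, concurrent_gens: int) -> int:
        # Same guard as the patch: with no history for the current model yet,
        # average_tps is 0, so report 0 instead of raising ZeroDivisionError.
        if average_tps <= 0:
            return 0
        seconds_per_request = average_output_tokens / average_tps  # ~= average generation_time
        return int((seconds_per_request * proompters_in_queue) / concurrent_gens)


    print(estimated_wait_sec(average_output_tokens=180, average_tps=9.0,
                             proompters_in_queue=4, concurrent_gens=2))  # -> 40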
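
On the threads.py switch from `... if not None else 0` to `... or 0` (my reading; the commit message doesn't spell it out): `not None` is always True, so the old expression always returned the raw AVG() result, including the None that sqlite3 returns when AVG() runs over zero rows. `or 0` actually applies the fallback. A tiny illustration, not repo code:

    def old_fallback(value):
        return value if not None else 0  # `not None` is always True, so None passes through


    def new_fallback(value):
        return value or 0                # None (and 0.0) become 0


    print(old_fallback(None))  # None -- the stats math downstream can't use this
    print(new_fallback(None))  # 0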