From f9b9051badfc59e7bf3e31dac5679ec97fb37f90 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Tue, 29 Aug 2023 15:46:56 -0600
Subject: [PATCH] update weighted_average_column_for_model to account for when
 there was an error reported, insert null for response tokens when error,
 correctly parse x-forwarded-for, correctly convert model reported by
 hf-textgen

---
 llm_server/database.py                 | 12 +++++++-----
 llm_server/llm/info.py                 | 15 +++++----------
 llm_server/routes/v1/generate.py       |  2 +-
 llm_server/routes/v1/generate_stats.py |  2 +-
 llm_server/threads.py                  |  8 +++++---
 5 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/llm_server/database.py b/llm_server/database.py
index 41a5fa1..d31e482 100644
--- a/llm_server/database.py
+++ b/llm_server/database.py
@@ -48,9 +48,11 @@ def init_db():
 
 def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code, response_tokens: int = None, is_error: bool = False):
     prompt_tokens = len(tokenizer.encode(prompt))
-    # TODO: insert None for response tokens when error
-    if not response_tokens:
-        response_tokens = len(tokenizer.encode(response))
+    if not is_error:
+        if not response_tokens:
+            response_tokens = len(tokenizer.encode(response))
+    else:
+        response_tokens = None
 
     # Sometimes we may want to insert null into the DB, but
     # usually we want to insert a float.
@@ -127,7 +129,7 @@ def average_column_for_model(table_name, column_name, model_name):
     return result[0]
 
 
-def weighted_average_column_for_model(table_name, column_name, model_name):
+def weighted_average_column_for_model(table_name, column_name, model_name, exclude_zeros: bool = False):
     conn = sqlite3.connect(opts.database_path)
     cursor = conn.cursor()
     cursor.execute(f"SELECT DISTINCT model FROM {table_name}")
@@ -144,7 +146,7 @@ def weighted_average_column_for_model(table_name, column_name, model_name):
         total_weight = 0
         weighted_sum = 0
         for i, (value, rowid) in enumerate(results):
-            if value is None:
+            if value is None or (exclude_zeros and value == 0):
                 continue
             weight = i + 1
             total_weight += weight
diff --git a/llm_server/llm/info.py b/llm_server/llm/info.py
index c39905b..c59a470 100644
--- a/llm_server/llm/info.py
+++ b/llm_server/llm/info.py
@@ -9,20 +9,15 @@ def get_running_model():
             backend_response = requests.get(f'{opts.backend_url}/api/v1/model', timeout=3, verify=opts.verify_ssl)
         except Exception as e:
             return False, e
-        try:
-            r_json = backend_response.json()
-            return r_json['result'], None
-        except Exception as e:
-            return False, e
     elif opts.mode == 'hf-textgen':
         try:
             backend_response = requests.get(f'{opts.backend_url}/info', verify=opts.verify_ssl)
         except Exception as e:
             return False, e
-        try:
-            r_json = backend_response.json()
-            return r_json['model_id'].replace('/', '_'), None
-        except Exception as e:
-            return False, e
     else:
         raise Exception
+    try:
+        r_json = backend_response.json()
+        return r_json['model_id'].replace('/', '_'), None
+    except Exception as e:
+        return False, e
diff --git a/llm_server/routes/v1/generate.py b/llm_server/routes/v1/generate.py
index c88edde..9660360 100644
--- a/llm_server/routes/v1/generate.py
+++ b/llm_server/routes/v1/generate.py
@@ -28,7 +28,7 @@ def generate():
     if request.headers.get('cf-connecting-ip'):
         client_ip = request.headers.get('cf-connecting-ip')
     elif request.headers.get('x-forwarded-for'):
-        client_ip = request.headers.get('x-forwarded-for')
+        client_ip = request.headers.get('x-forwarded-for').split(',')[0]
     else:
         client_ip = request.remote_addr
 
diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py
index 23dfc4f..8f37f6f 100644
--- a/llm_server/routes/v1/generate_stats.py
+++ b/llm_server/routes/v1/generate_stats.py
@@ -34,7 +34,7 @@ def generate_stats():
 
     if opts.average_generation_time_mode == 'database':
         average_generation_time = float(redis.get('average_generation_elapsed_sec'))
-        average_output_tokens = float(redis.get('average_output_tokens'))
+        # average_output_tokens = float(redis.get('average_output_tokens'))
         # average_generation_time_from_tps = (average_output_tokens / average_tps)
 
         # What to use in our math that calculates the wait time.
diff --git a/llm_server/threads.py b/llm_server/threads.py
index dbfb779..c37eb5e 100644
--- a/llm_server/threads.py
+++ b/llm_server/threads.py
@@ -38,7 +38,7 @@ class MainBackgroundThread(Thread):
                 try:
                     r = requests.get(f'{opts.backend_url}/info', timeout=3, verify=opts.verify_ssl)
                     j = r.json()
-                    opts.running_model = j['model_id']
+                    opts.running_model = j['model_id'].replace('/', '_')
                     redis.set('backend_online', 1)
                     redis.set_dict('backend_info', j)
                 except Exception as e:
@@ -48,13 +48,15 @@ class MainBackgroundThread(Thread):
             else:
                 raise Exception
 
-            average_generation_elapsed_sec = weighted_average_column_for_model('prompts', 'generation_time', opts.running_model) or 0
+            # exclude_zeros=True filters out rows where an error message was returned. Previously, if there was an error, 0
+            # was entered into the column. The new code enters null instead but we need to be backwards compatible for now
+            average_generation_elapsed_sec = weighted_average_column_for_model('prompts', 'generation_time', opts.running_model, exclude_zeros=True) or 0
             redis.set('average_generation_elapsed_sec', average_generation_elapsed_sec)
 
             # overall = average_column_for_model('prompts', 'generation_time', opts.running_model)
             # print(f'Weighted: {average_generation_elapsed_sec}, overall: {overall}')
 
-            average_output_tokens = weighted_average_column_for_model('prompts', 'response_tokens', opts.running_model) or 0
+            average_output_tokens = weighted_average_column_for_model('prompts', 'response_tokens', opts.running_model, exclude_zeros=True) or 0
             redis.set('average_output_tokens', average_output_tokens)
 
             # overall = average_column_for_model('prompts', 'response_tokens', opts.running_model)
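
A minimal standalone sketch (not code from the repository) of the recency-weighted average that weighted_average_column_for_model computes, with the skip condition this patch adds. It assumes values are iterated oldest to newest so more recent rows carry larger weights, and that None marks new-style error rows while 0 marks old-style ones.

    from typing import Optional, Sequence

    def weighted_average(values: Sequence[Optional[float]], exclude_zeros: bool = False) -> Optional[float]:
        # Hypothetical helper mirroring the patched loop: weight i + 1 grows with recency.
        total_weight = 0
        weighted_sum = 0.0
        for i, value in enumerate(values):
            # Skip null rows (new error marker) and, optionally, zero rows (old error marker).
            if value is None or (exclude_zeros and value == 0):
                continue
            weight = i + 1
            total_weight += weight
            weighted_sum += weight * value
        return weighted_sum / total_weight if total_weight else None

    # 4.0 gets weight 1, the two error rows are skipped, 9.0 gets weight 4 -> (4 + 36) / 5 = 8.0
    print(weighted_average([4.0, 0, None, 9.0], exclude_zeros=True))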