diff --git a/llm_server/database.py b/llm_server/database.py index 52f74b8..486e2c5 100644 --- a/llm_server/database.py +++ b/llm_server/database.py @@ -23,6 +23,7 @@ def init_db(): response TEXT, response_tokens INTEGER, response_status INTEGER, + generation_time INTEGER, parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)), headers TEXT CHECK (headers IS NULL OR json_valid(headers)), timestamp INTEGER @@ -43,7 +44,7 @@ def init_db(): conn.close() -def log_prompt(ip, token, prompt, response, parameters, headers, backend_response_code): +def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code): prompt_tokens = len(tokenizer.encode(prompt)) response_tokens = len(tokenizer.encode(response)) @@ -53,8 +54,8 @@ def log_prompt(ip, token, prompt, response, parameters, headers, backend_respons timestamp = int(time.time()) conn = sqlite3.connect(opts.database_path) c = conn.cursor() - c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", - (ip, token, prompt, prompt_tokens, response, response_tokens, backend_response_code, json.dumps(parameters), json.dumps(headers), timestamp)) + c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + (ip, token, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, json.dumps(parameters), json.dumps(headers), timestamp)) conn.commit() conn.close() diff --git a/llm_server/routes/stats.py b/llm_server/routes/stats.py index cf07127..806baaf 100644 --- a/llm_server/routes/stats.py +++ b/llm_server/routes/stats.py @@ -10,21 +10,22 @@ server_start_time = datetime.now() # TODO: have a background thread put the averages in a variable so we don't end up with massive arrays -wait_in_queue_elapsed = [] -wait_in_queue_elapsed_lock = Lock() +# wait_in_queue_elapsed = [] +# wait_in_queue_elapsed_lock = Lock() generation_elapsed = [] generation_elapsed_lock = Lock() -def elapsed_times_cleanup(): - global wait_in_queue_elapsed - while True: - current_time = time.time() - with wait_in_queue_elapsed_lock: - global wait_in_queue_elapsed - wait_in_queue_elapsed = [(end_time, elapsed_time) for end_time, elapsed_time in wait_in_queue_elapsed if current_time - end_time <= 60] - time.sleep(1) +# TODO: do I need this? +# def elapsed_times_cleanup(): +# global wait_in_queue_elapsed +# while True: +# current_time = time.time() +# with wait_in_queue_elapsed_lock: +# global wait_in_queue_elapsed +# wait_in_queue_elapsed = [(end_time, elapsed_time) for end_time, elapsed_time in wait_in_queue_elapsed if current_time - end_time <= 60] +# time.sleep(1) def calculate_avg_gen_time(): @@ -37,10 +38,18 @@ def calculate_avg_gen_time(): def process_avg_gen_time(): + global generation_elapsed while True: with generation_elapsed_lock: + # Get the current time + current_time = time.time() + + # Remove data older than 3 minutes + three_minutes_ago = current_time - 180 + generation_elapsed[:] = [(end, elapsed) for end, elapsed in generation_elapsed if end >= three_minutes_ago] + # Get the data from the last minute - one_minute_ago = time.time() - 60 + one_minute_ago = current_time - 60 recent_data = [elapsed for end, elapsed in generation_elapsed if end >= one_minute_ago] # Calculate the average diff --git a/llm_server/routes/v1/generate.py b/llm_server/routes/v1/generate.py index e516fc3..b05ffac 100644 --- a/llm_server/routes/v1/generate.py +++ b/llm_server/routes/v1/generate.py @@ -76,7 +76,7 @@ def generate(): else: raise Exception - log_prompt(client_ip, token, request_json_body['prompt'], backend_response, parameters, dict(request.headers), response.status_code) + log_prompt(client_ip, token, request_json_body['prompt'], backend_response, elapsed_time, parameters, dict(request.headers), response.status_code) return jsonify({ 'code': 500, 'error': 'failed to reach backend', @@ -95,7 +95,7 @@ def generate(): else: raise Exception - log_prompt(client_ip, token, request_json_body['prompt'], backend_response, parameters, dict(request.headers), response.status_code) + log_prompt(client_ip, token, request_json_body['prompt'], backend_response, elapsed_time, parameters, dict(request.headers), response.status_code) return jsonify({ **response_json_body }), 200 @@ -111,7 +111,7 @@ def generate(): } else: raise Exception - log_prompt(client_ip, token, request_json_body['prompt'], backend_response, parameters, dict(request.headers), response.status_code) + log_prompt(client_ip, token, request_json_body['prompt'], backend_response, elapsed_time, parameters, dict(request.headers), response.status_code) return jsonify({ 'code': 500, 'error': 'the backend did not return valid JSON',