Log generation time to the database; also cap generation_elapsed history at 3 minutes

This commit is contained in:
Cyberes 2023-08-23 22:20:39 -06:00
parent 3317bd5f1a
commit e52acb03a4
3 changed files with 27 additions and 17 deletions

View File

@ -23,6 +23,7 @@ def init_db():
response TEXT, response TEXT,
response_tokens INTEGER, response_tokens INTEGER,
response_status INTEGER, response_status INTEGER,
generation_time INTEGER,
parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)), parameters TEXT CHECK (parameters IS NULL OR json_valid(parameters)),
headers TEXT CHECK (headers IS NULL OR json_valid(headers)), headers TEXT CHECK (headers IS NULL OR json_valid(headers)),
timestamp INTEGER timestamp INTEGER
@ -43,7 +44,7 @@ def init_db():
conn.close() conn.close()
def log_prompt(ip, token, prompt, response, parameters, headers, backend_response_code): def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backend_response_code):
prompt_tokens = len(tokenizer.encode(prompt)) prompt_tokens = len(tokenizer.encode(prompt))
response_tokens = len(tokenizer.encode(response)) response_tokens = len(tokenizer.encode(response))
@ -53,8 +54,8 @@ def log_prompt(ip, token, prompt, response, parameters, headers, backend_respons
timestamp = int(time.time()) timestamp = int(time.time())
conn = sqlite3.connect(opts.database_path) conn = sqlite3.connect(opts.database_path)
c = conn.cursor() c = conn.cursor()
c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
(ip, token, prompt, prompt_tokens, response, response_tokens, backend_response_code, json.dumps(parameters), json.dumps(headers), timestamp)) (ip, token, prompt, prompt_tokens, response, response_tokens, backend_response_code, gen_time, json.dumps(parameters), json.dumps(headers), timestamp))
conn.commit() conn.commit()
conn.close() conn.close()

View File

@ -10,21 +10,22 @@ server_start_time = datetime.now()
# TODO: have a background thread put the averages in a variable so we don't end up with massive arrays # TODO: have a background thread put the averages in a variable so we don't end up with massive arrays
wait_in_queue_elapsed = [] # wait_in_queue_elapsed = []
wait_in_queue_elapsed_lock = Lock() # wait_in_queue_elapsed_lock = Lock()
generation_elapsed = [] generation_elapsed = []
generation_elapsed_lock = Lock() generation_elapsed_lock = Lock()
def elapsed_times_cleanup(): # TODO: do I need this?
global wait_in_queue_elapsed # def elapsed_times_cleanup():
while True: # global wait_in_queue_elapsed
current_time = time.time() # while True:
with wait_in_queue_elapsed_lock: # current_time = time.time()
global wait_in_queue_elapsed # with wait_in_queue_elapsed_lock:
wait_in_queue_elapsed = [(end_time, elapsed_time) for end_time, elapsed_time in wait_in_queue_elapsed if current_time - end_time <= 60] # global wait_in_queue_elapsed
time.sleep(1) # wait_in_queue_elapsed = [(end_time, elapsed_time) for end_time, elapsed_time in wait_in_queue_elapsed if current_time - end_time <= 60]
# time.sleep(1)
def calculate_avg_gen_time(): def calculate_avg_gen_time():
@ -37,10 +38,18 @@ def calculate_avg_gen_time():
def process_avg_gen_time(): def process_avg_gen_time():
global generation_elapsed
while True: while True:
with generation_elapsed_lock: with generation_elapsed_lock:
# Get the current time
current_time = time.time()
# Remove data older than 3 minutes
three_minutes_ago = current_time - 180
generation_elapsed[:] = [(end, elapsed) for end, elapsed in generation_elapsed if end >= three_minutes_ago]
# Get the data from the last minute # Get the data from the last minute
one_minute_ago = time.time() - 60 one_minute_ago = current_time - 60
recent_data = [elapsed for end, elapsed in generation_elapsed if end >= one_minute_ago] recent_data = [elapsed for end, elapsed in generation_elapsed if end >= one_minute_ago]
# Calculate the average # Calculate the average

View File

@ -76,7 +76,7 @@ def generate():
else: else:
raise Exception raise Exception
log_prompt(client_ip, token, request_json_body['prompt'], backend_response, parameters, dict(request.headers), response.status_code) log_prompt(client_ip, token, request_json_body['prompt'], backend_response, elapsed_time, parameters, dict(request.headers), response.status_code)
return jsonify({ return jsonify({
'code': 500, 'code': 500,
'error': 'failed to reach backend', 'error': 'failed to reach backend',
@ -95,7 +95,7 @@ def generate():
else: else:
raise Exception raise Exception
log_prompt(client_ip, token, request_json_body['prompt'], backend_response, parameters, dict(request.headers), response.status_code) log_prompt(client_ip, token, request_json_body['prompt'], backend_response, elapsed_time, parameters, dict(request.headers), response.status_code)
return jsonify({ return jsonify({
**response_json_body **response_json_body
}), 200 }), 200
@ -111,7 +111,7 @@ def generate():
} }
else: else:
raise Exception raise Exception
log_prompt(client_ip, token, request_json_body['prompt'], backend_response, parameters, dict(request.headers), response.status_code) log_prompt(client_ip, token, request_json_body['prompt'], backend_response, elapsed_time, parameters, dict(request.headers), response.status_code)
return jsonify({ return jsonify({
'code': 500, 'code': 500,
'error': 'the backend did not return valid JSON', 'error': 'the backend did not return valid JSON',