diff --git a/llm_server/database.py b/llm_server/database.py
index 214d618..994bead 100644
--- a/llm_server/database.py
+++ b/llm_server/database.py
@@ -74,7 +74,7 @@ def log_prompt(ip, token, prompt, response, gen_time, parameters, headers, backe
     timestamp = int(time.time())
     conn = sqlite3.connect(opts.database_path)
     c = conn.cursor()
-    c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+    c.execute("INSERT INTO prompts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
               (ip, token, opts.running_model, opts.mode, opts.backend_url, request_url, gen_time, prompt, prompt_tokens, response, response_tokens, backend_response_code, json.dumps(parameters), json.dumps(headers), timestamp))
     conn.commit()
     conn.close()
diff --git a/llm_server/llm/llm_backend.py b/llm_server/llm/llm_backend.py
index 2081ae8..d4dbecb 100644
--- a/llm_server/llm/llm_backend.py
+++ b/llm_server/llm/llm_backend.py
@@ -6,7 +6,7 @@ import flask
 class LLMBackend:
     default_params: dict

-    def handle_response(self, request: flask.Request, success, response: flask.Response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+    def handle_response(self, success, request: flask.Request, response: flask.Response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
         raise NotImplementedError

     def validate_params(self, params_dict: dict) -> Tuple[bool, str | None]:
diff --git a/llm_server/llm/oobabooga/ooba_backend.py b/llm_server/llm/oobabooga/ooba_backend.py
index 49be068..28dda5e 100644
--- a/llm_server/llm/oobabooga/ooba_backend.py
+++ b/llm_server/llm/oobabooga/ooba_backend.py
@@ -9,7 +9,7 @@ from ...routes.helpers.http import validate_json


 class OobaboogaBackend(LLMBackend):
-    def handle_response(self, request, success, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
+    def handle_response(self, success, request, response, error_msg, client_ip, token, prompt, elapsed_time, parameters, headers):
         backend_err = False
         response_valid_json, response_json_body = validate_json(response)
         try:
diff --git a/llm_server/llm/vllm/vllm_backend.py b/llm_server/llm/vllm/vllm_backend.py
index 49a2a67..94f39a7 100644
--- a/llm_server/llm/vllm/vllm_backend.py
+++ b/llm_server/llm/vllm/vllm_backend.py
@@ -17,7 +17,7 @@ from llm_server.routes.helpers.http import validate_json
 class VLLMBackend(LLMBackend):
     default_params = vars(SamplingParams())

-    def handle_response(self, request, success, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
+    def handle_response(self, success, request, response, error_msg, client_ip, token, prompt: str, elapsed_time, parameters, headers):
         response_valid_json, response_json_body = validate_json(response)
         backend_err = False
         try:
@@ -41,7 +41,7 @@ class VLLMBackend(LLMBackend):
             #     f'HTTP CODE {response_status_code}'
             # )

-            log_prompt(client_ip, token, prompt, backend_response, elapsed_time if not backend_err else None, parameters, headers, response_status_code, request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
+            log_prompt(ip=client_ip, token=token, prompt=prompt, response=backend_response, gen_time=elapsed_time if not backend_err else None, parameters=parameters, headers=headers, backend_response_code=response_status_code, request_url=request.url, response_tokens=response_json_body.get('details', {}).get('generated_tokens'), is_error=backend_err)
             return jsonify({'results': [{'text': backend_response}]}), 200
         else:
             backend_response = format_sillytavern_err(f'The backend did not return valid JSON.', 'error')
diff --git a/llm_server/routes/openai_request_handler.py b/llm_server/routes/openai_request_handler.py
index a360e71..a26e910 100644
--- a/llm_server/routes/openai_request_handler.py
+++ b/llm_server/routes/openai_request_handler.py
@@ -62,12 +62,12 @@ class OpenAIRequestHandler(RequestHandler):
         elapsed_time = end_time - self.start_time
         self.used = True

-        response, response_status_code = self.backend.handle_response(success, backend_response, error_msg, self.client_ip, self.token, self.prompt, elapsed_time, self.parameters, dict(self.request.headers))
+        response, response_status_code = self.backend.handle_response(success=success, request=self.request, response=backend_response, error_msg=error_msg, client_ip=self.client_ip, token=self.token, prompt=self.prompt, elapsed_time=elapsed_time, parameters=self.parameters, headers=dict(self.request.headers))
         return build_openai_response(self.prompt, response.json['results'][0]['text']), 200

     def handle_ratelimited(self):
         backend_response = format_sillytavern_err(f'Ratelimited: you are only allowed to have {opts.simultaneous_requests_per_ip} simultaneous requests at a time. Please complete your other requests before sending another.', 'error')
-        log_prompt(self.client_ip, self.token, self.request_json_body.get('prompt', ''), backend_response, None, self.parameters, dict(self.request.headers), 429, self.request.url, is_error=True)
+        log_prompt(ip=self.client_ip, token=self.token, prompt=self.request_json_body.get('prompt', ''), response=backend_response, gen_time=None, parameters=self.parameters, headers=dict(self.request.headers), backend_response_code=429, request_url=self.request.url, is_error=True)
         return build_openai_response(self.prompt, backend_response), 200

     def transform_messages_to_prompt(self):