diff --git a/llm_server/llm/openai/oai_to_vllm.py b/llm_server/llm/openai/oai_to_vllm.py
index 9111389..e224418 100644
--- a/llm_server/llm/openai/oai_to_vllm.py
+++ b/llm_server/llm/openai/oai_to_vllm.py
@@ -26,6 +26,7 @@ def oai_to_vllm(request_json_body, hashes: bool, mode):
 
 
 def format_oai_err(err_msg):
+    print('OAI ERROR MESSAGE:', err_msg)
     return jsonify({
         "error": {
             "message": err_msg,
diff --git a/llm_server/routes/openai_request_handler.py b/llm_server/routes/openai_request_handler.py
index 8664695..0dfd558 100644
--- a/llm_server/routes/openai_request_handler.py
+++ b/llm_server/routes/openai_request_handler.py
@@ -25,6 +25,7 @@ class OpenAIRequestHandler(RequestHandler):
         self.prompt = None
 
     def handle_request(self) -> Tuple[flask.Response, int]:
+        print('recieved request')
         assert not self.used
 
         if opts.openai_silent_trim:
@@ -66,11 +67,13 @@ class OpenAIRequestHandler(RequestHandler):
         model = self.request_json_body.get('model')
 
         if success:
+            print('sent success response')
             return self.build_openai_response(self.prompt, backend_response.json['results'][0]['text'], model=model), backend_response_status_code
         else:
             return backend_response, backend_response_status_code
 
     def handle_ratelimited(self, do_log: bool = True):
+        print('OAI ratelimited:', self.client_ip, self.request.headers)
         _, default_backend_info = get_model_choices()
         w = int(default_backend_info['estimated_wait']) if default_backend_info['estimated_wait'] > 0 else 2
         response = jsonify({