diff --git a/llm_server/llm/openai/oai_to_vllm.py b/llm_server/llm/openai/oai_to_vllm.py index 35c9f30..1490933 100644 --- a/llm_server/llm/openai/oai_to_vllm.py +++ b/llm_server/llm/openai/oai_to_vllm.py @@ -27,6 +27,8 @@ def oai_to_vllm(request_json_body, stop_hashes: bool, mode): if mode == 'vllm' and request_json_body.get('top_p') == 0: request_json_body['top_p'] = 0.01 + request_json_body['max_tokens'] = min(max(request_json_body.get('max_new_tokens', 0), request_json_body.get('max_tokens', 0)), opts.max_new_tokens) + return request_json_body diff --git a/llm_server/routes/request_handler.py b/llm_server/routes/request_handler.py index a048df7..dd8326b 100644 --- a/llm_server/routes/request_handler.py +++ b/llm_server/routes/request_handler.py @@ -37,6 +37,7 @@ class RequestHandler: self.parameters = None self.used = False + self.selected_model = selected_model self.backend_url = get_a_cluster_backend(selected_model) self.cluster_backend_info = cluster_config.get_backend(self.backend_url)