""" This file is used by the worker that processes requests. """ import requests from llm_server.config.global_config import GlobalConfig # TODO: make the VLMM backend return TPS and time elapsed # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py def prepare_json(json_data: dict): # Convert back to VLLM. json_data['max_tokens'] = json_data.pop('max_new_tokens') return json_data def transform_prompt_to_text(prompt: list): text = '' for item in prompt: text += item['content'] + '\n' return text.strip('\n') def handle_blocking_request(json_data: dict, cluster_backend, timeout: int = 10): try: r = requests.post(f'{cluster_backend}/generate', json=prepare_json(json_data), verify=GlobalConfig.get().verify_ssl, timeout=GlobalConfig.get().backend_generate_request_timeout if not timeout else timeout) except requests.exceptions.ReadTimeout: # print(f'Failed to reach VLLM inference endpoint - request to backend timed out') return False, None, 'Request to backend timed out' except Exception as e: # print(f'Failed to reach VLLM inference endpoint -', f'{e.__class__.__name__}: {e}') return False, None, f'Request to backend encountered error -- {e.__class__.__name__}: {e}' if r.status_code != 200: # print(f'Failed to reach VLLM inference endpoint - got code {r.status_code}') return False, r, f'Backend returned {r.status_code}' return True, r, None def generate(json_data: dict, cluster_backend, timeout: int = None): if json_data.get('stream'): try: return requests.post(f'{cluster_backend}/generate', json=prepare_json(json_data), stream=True, verify=GlobalConfig.get().verify_ssl, timeout=GlobalConfig.get().backend_generate_request_timeout if not timeout else timeout) except Exception as e: return False else: return handle_blocking_request(json_data, cluster_backend, timeout=timeout)