local-llm-server/llm_server/llm/vllm/generate.py

"""
This file is used by the worker that processes requests.
"""
import requests

from llm_server.config.global_config import GlobalConfig


# TODO: make the vLLM backend return TPS and time elapsed
# https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py
def prepare_json(json_data: dict):
    # Rename the generic 'max_new_tokens' parameter back to the 'max_tokens' name vLLM expects.
    json_data['max_tokens'] = json_data.pop('max_new_tokens')
    return json_data
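
# A minimal sketch of what prepare_json does to an incoming payload (values hypothetical):
#   prepare_json({'prompt': 'Hello', 'max_new_tokens': 16})
#   -> {'prompt': 'Hello', 'max_tokens': 16}
# Note that the input dict is mutated in place, and a missing 'max_new_tokens'
# key raises KeyError.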


def transform_prompt_to_text(prompt: list):
    # Flatten a list of chat-style message dicts into one newline-separated string.
    text = ''
    for item in prompt:
        text += item['content'] + '\n'
    return text.strip('\n')
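
# Example (hypothetical OpenAI-style messages) of how a chat prompt is flattened:
#   transform_prompt_to_text([{'role': 'user', 'content': 'Hi'},
#                             {'role': 'assistant', 'content': 'Hello!'}])
#   -> 'Hi\nHello!'
# Only the 'content' fields are kept; role information is discarded.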


def handle_blocking_request(json_data: dict, cluster_backend, timeout: int = None):
    # Send a non-streaming generation request and normalize the outcome to a
    # (success, response, error_message) tuple.
    try:
        r = requests.post(
            f'{cluster_backend}/generate',
            json=prepare_json(json_data),
            verify=GlobalConfig.get().verify_ssl,
            timeout=timeout if timeout else GlobalConfig.get().backend_generate_request_timeout,
        )
    except requests.exceptions.ReadTimeout:
        # print(f'Failed to reach VLLM inference endpoint - request to backend timed out')
        return False, None, 'Request to backend timed out'
    except Exception as e:
        # print(f'Failed to reach VLLM inference endpoint -', f'{e.__class__.__name__}: {e}')
        return False, None, f'Request to backend encountered error -- {e.__class__.__name__}: {e}'
    if r.status_code != 200:
        # print(f'Failed to reach VLLM inference endpoint - got code {r.status_code}')
        return False, r, f'Backend returned {r.status_code}'
    return True, r, None
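
# Sketch of how a caller might consume the (success, response, error) tuple; the
# backend URL and payload here are hypothetical:
#   success, response, error = handle_blocking_request(
#       {'prompt': 'Hello', 'max_new_tokens': 16}, 'http://127.0.0.1:8000')
#   if success:
#       print(response.json())
#   else:
#       print(f'Backend error: {error}')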


def generate(json_data: dict, cluster_backend, timeout: int = None):
    if json_data.get('stream'):
        # Streaming: return the raw streaming Response for the caller to iterate,
        # or False if the backend could not be reached.
        try:
            return requests.post(
                f'{cluster_backend}/generate',
                json=prepare_json(json_data),
                stream=True,
                verify=GlobalConfig.get().verify_ssl,
                timeout=timeout if timeout else GlobalConfig.get().backend_generate_request_timeout,
            )
        except Exception:
            return False
    else:
        return handle_blocking_request(json_data, cluster_backend, timeout=timeout)
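
# Usage sketch (hypothetical backend URL). A streaming call returns a
# requests.Response opened with stream=True (or False on failure); a blocking
# call returns the tuple from handle_blocking_request:
#   r = generate({'prompt': 'Hello', 'max_new_tokens': 16, 'stream': True},
#                'http://127.0.0.1:8000')
#   if r is not False:
#       for chunk in r.iter_content(chunk_size=None):
#           print(chunk.decode(errors='ignore'), end='')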