diff --git a/llm_server/llm/info.py b/llm_server/llm/info.py
index 0718dfd..b26093d 100644
--- a/llm_server/llm/info.py
+++ b/llm_server/llm/info.py
@@ -9,14 +9,14 @@
 def get_running_model():
     if opts.mode == 'oobabooga':
         try:
-            backend_response = requests.get(f'{opts.backend_url}/api/v1/model', timeout=3, verify=opts.verify_ssl)
+            backend_response = requests.get(f'{opts.backend_url}/api/v1/model', timeout=10, verify=opts.verify_ssl)
             r_json = backend_response.json()
             return r_json['result'], None
         except Exception as e:
             return False, e
     elif opts.mode == 'vllm':
         try:
-            backend_response = requests.get(f'{opts.backend_url}/model', timeout=3, verify=opts.verify_ssl)
+            backend_response = requests.get(f'{opts.backend_url}/model', timeout=10, verify=opts.verify_ssl)
             r_json = backend_response.json()
             return r_json['model'], None
         except Exception as e:
diff --git a/llm_server/llm/oobabooga/generate.py b/llm_server/llm/oobabooga/generate.py
index ffa83e5..c4736af 100644
--- a/llm_server/llm/oobabooga/generate.py
+++ b/llm_server/llm/oobabooga/generate.py
@@ -9,7 +9,7 @@
 from llm_server import opts
 def generate(json_data: dict):
     try:
-        r = requests.post(f'{opts.backend_url}/api/v1/generate', json=json_data, verify=opts.verify_ssl)
+        r = requests.post(f'{opts.backend_url}/api/v1/generate', json=json_data, verify=opts.verify_ssl, timeout=120)
     except Exception as e:
         return False, None, f'{e.__class__.__name__}: {e}'
     if r.status_code != 200:
diff --git a/llm_server/llm/vllm/generate.py b/llm_server/llm/vllm/generate.py
index b740679..0a689d8 100644
--- a/llm_server/llm/vllm/generate.py
+++ b/llm_server/llm/vllm/generate.py
@@ -79,7 +79,7 @@ def transform_prompt_to_text(prompt: list):
 
 def handle_blocking_request(json_data: dict):
     try:
-        r = requests.post(f'{opts.backend_url}/generate', json=prepare_json(json_data), verify=opts.verify_ssl)
+        r = requests.post(f'{opts.backend_url}/generate', json=prepare_json(json_data), verify=opts.verify_ssl, timeout=120)
     except Exception as e:
         return False, None, f'{e.__class__.__name__}: {e}'
 
diff --git a/llm_server/netdata.py b/llm_server/netdata.py
index 44c68ea..d3abf32 100644
--- a/llm_server/netdata.py
+++ b/llm_server/netdata.py
@@ -44,7 +44,7 @@ def get_gpu_wh(gpu_id: int):
         "format": "json",
         "options": "absolute|jsonwrap"
     }
-    response = requests.get(f'{opts.netdata_root}/api/v1/data', params=params)
+    response = requests.get(f'{opts.netdata_root}/api/v1/data', params=params, timeout=10)
     data = json.loads(response.text)
     total_power_usage_watts = sum(point[1] for point in data['result']['data'])
     # total_power_usage_watt_hours = round(total_power_usage_watts / 3600, 1)
diff --git a/other/vllm/vllm.service b/other/vllm/vllm.service
new file mode 100644
index 0000000..d8da27c
--- /dev/null
+++ b/other/vllm/vllm.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=VLLM Backend
+Wants=basic.target
+After=basic.target network.target
+
+[Service]
+User=USERNAME
+Group=USERNAME
+# Can add --disable-log-requests when I know the backend won't crash
+ExecStart=/storage/vllm/venv/bin/python /storage/vllm/api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
+Restart=always
+RestartSec=2
+
+[Install]
+WantedBy=multi-user.target
diff --git a/other/vllm/vllm_api_server.py b/other/vllm/vllm_api_server.py
index 98a1f7d..f5b5f45 100644
--- a/other/vllm/vllm_api_server.py
+++ b/other/vllm/vllm_api_server.py
@@ -12,8 +12,6 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
-# python api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
-
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
 app = FastAPI()