adjust requests timeout, add service file
parent 035c17c48b
commit c45e68a8c8
@@ -9,14 +9,14 @@ def get_running_model():
     if opts.mode == 'oobabooga':
         try:
-            backend_response = requests.get(f'{opts.backend_url}/api/v1/model', timeout=3, verify=opts.verify_ssl)
+            backend_response = requests.get(f'{opts.backend_url}/api/v1/model', timeout=10, verify=opts.verify_ssl)
             r_json = backend_response.json()
             return r_json['result'], None
         except Exception as e:
             return False, e
     elif opts.mode == 'vllm':
         try:
-            backend_response = requests.get(f'{opts.backend_url}/model', timeout=3, verify=opts.verify_ssl)
+            backend_response = requests.get(f'{opts.backend_url}/model', timeout=10, verify=opts.verify_ssl)
             r_json = backend_response.json()
             return r_json['model'], None
         except Exception as e:
             return False, e
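Note: the timeout argument to requests.get() bounds how long the call may take before raising requests.exceptions.Timeout, which the broad except Exception in get_running_model() then returns as the error value. A minimal sketch of that pattern, assuming an oobabooga-style backend at a placeholder URL (not part of this repo):

import requests

def fetch_model_name(base_url: str, timeout_secs: int = 10):
    # Mirrors the (value, error) tuple convention used by get_running_model() above.
    try:
        response = requests.get(f'{base_url}/api/v1/model', timeout=timeout_secs)
        return response.json()['result'], None
    except requests.exceptions.Timeout as e:
        # The backend did not answer within timeout_secs seconds.
        return False, e
    except Exception as e:
        return False, e

# Example call (placeholder address):
# print(fetch_model_name('http://127.0.0.1:7860'))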
@@ -9,7 +9,7 @@ from llm_server import opts
 
 def generate(json_data: dict):
     try:
-        r = requests.post(f'{opts.backend_url}/api/v1/generate', json=json_data, verify=opts.verify_ssl)
+        r = requests.post(f'{opts.backend_url}/api/v1/generate', json=json_data, verify=opts.verify_ssl, timeout=120)
     except Exception as e:
         return False, None, f'{e.__class__.__name__}: {e}'
     if r.status_code != 200:
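For reference, requests also accepts a (connect, read) tuple for timeout, so a long read timeout for generation does not force an equally long connect timeout. This is only a possible variant, not what the commit does; the names below are illustrative:

import requests

def generate_with_split_timeout(backend_url: str, json_data: dict):
    # Variant sketch: fail fast on connect (5 s) if the backend is unreachable,
    # while still allowing up to 120 s for the generation response to arrive.
    try:
        r = requests.post(f'{backend_url}/api/v1/generate', json=json_data, timeout=(5, 120))
    except Exception as e:
        return False, None, f'{e.__class__.__name__}: {e}'
    if r.status_code != 200:
        return False, None, f'backend returned status {r.status_code}'
    return True, r, None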
@@ -79,7 +79,7 @@ def transform_prompt_to_text(prompt: list):
 
 def handle_blocking_request(json_data: dict):
     try:
-        r = requests.post(f'{opts.backend_url}/generate', json=prepare_json(json_data), verify=opts.verify_ssl)
+        r = requests.post(f'{opts.backend_url}/generate', json=prepare_json(json_data), verify=opts.verify_ssl, timeout=120)
     except Exception as e:
         return False, None, f'{e.__class__.__name__}: {e}'
 
@@ -44,7 +44,7 @@ def get_gpu_wh(gpu_id: int):
         "format": "json",
         "options": "absolute|jsonwrap"
     }
-    response = requests.get(f'{opts.netdata_root}/api/v1/data', params=params)
+    response = requests.get(f'{opts.netdata_root}/api/v1/data', params=params, timeout=10)
     data = json.loads(response.text)
     total_power_usage_watts = sum(point[1] for point in data['result']['data'])
     # total_power_usage_watt_hours = round(total_power_usage_watts / 3600, 1)
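The commented-out conversion in this hunk divides by 3600, which yields watt-hours only if Netdata returns one sample per second over the queried window. A small self-contained illustration with made-up data shaped like the 'jsonwrap' response the code already parses:

# Hypothetical Netdata-style payload: one [timestamp, watts] point per second for an hour.
data = {'result': {'data': [[1700000000 + i, 250.0] for i in range(3600)]}}

total_power_usage_watts = sum(point[1] for point in data['result']['data'])
total_power_usage_watt_hours = round(total_power_usage_watts / 3600, 1)
print(total_power_usage_watt_hours)  # 250.0 -> one hour at a constant 250 W is 250 Wh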
@@ -0,0 +1,15 @@
+[Unit]
+Description=VLLM Backend
+Wants=basic.target
+After=basic.target network.target
+
+[Service]
+User=USERNAME
+Group=USERNAME
+# Can add --disable-log-requests when I know the backend won't crash
+ExecStart=/storage/vllm/venv/bin/python /storage/vllm/api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
+Restart=always
+RestartSec=2
+
+[Install]
+WantedBy=multi-user.target
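Design note on the new unit file: Restart=always with RestartSec=2 makes systemd relaunch the vLLM backend two seconds after any exit, which fits the inline comment about the backend still being prone to crashes. To use it, the file would typically be placed under /etc/systemd/system/ (the target filename is not shown in this diff), followed by systemctl daemon-reload and systemctl enable --now on that unit; USERNAME is a placeholder for the account that owns /storage/vllm.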
@@ -12,8 +12,6 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid
 
-# python api_server.py --model /storage/oobabooga/one-click-installers/text-generation-webui/models/TheBloke_MythoMax-L2-13B-GPTQ/ --host 0.0.0.0 --port 7000 --max-num-batched-tokens 24576
-
 TIMEOUT_KEEP_ALIVE = 5  # seconds.
 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
 app = FastAPI()