Dynamically discover GPUs and fetch their power states from Netdata
This commit is contained in:
parent
16b986c206
commit
0230ddda17
|
@ -5,18 +5,26 @@ import requests
|
|||
from llm_server import opts
|
||||
|
||||
|
||||
def get_power_states():
    """Discover all GPUs exposed by Netdata and return their power states.

    Probes charts named ``nvidia_smi.gpu{N}_power_state`` starting at GPU 0
    and incrementing until Netdata answers with a non-200 status (the chart
    does not exist), so the GPU count is discovered dynamically instead of
    being hard-coded.

    Returns:
        dict: maps ``'gpuN'`` -> int power-state number (e.g. label ``'P8'``
        becomes ``8``). On a network/parse error, whatever was collected so
        far is returned (best-effort, matching the old single-GPU behaviour
        of never raising to the caller).
    """
    gpu_num = 0
    output = {}
    while True:
        url = f"{opts.netdata_root}/api/v1/data?chart=nvidia_smi.gpu{gpu_num}_power_state"
        try:
            response = requests.get(url, timeout=3)
            if response.status_code != 200:
                # No chart for this GPU index -> we've enumerated every GPU.
                break
            data = json.loads(response.text)
            # First row is the latest sample; columns after index 0 are
            # one-hot flags, one per power state, labelled 'P0', 'P1', ...
            # (assumed from the chart layout — TODO confirm against Netdata).
            power_state_data = data['data'][0]
            power_state = None
            for i in range(1, len(power_state_data)):
                if power_state_data[i] == 1:
                    power_state = data['labels'][i]
                    break
            # Guard against no flag being set: the original code would hit
            # AttributeError on None.lower(), fall into the except, and
            # silently truncate the results. Skip this GPU instead.
            if power_state is not None:
                output[f'gpu{gpu_num}'] = int(power_state.lower().strip('p'))
        except Exception as e:
            print('Failed to fetch Netdata metrics:', e)
            return output
        gpu_num += 1
    return output
|
||||
|
|
|
@ -18,5 +18,4 @@ show_num_prompts = True
|
|||
show_uptime = True
average_generation_time_mode = 'database'
show_total_output_tokens = True
# Base URL of the Netdata instance used to read GPU power states.
# None disables Netdata integration entirely. The per-chart metric name
# option was removed: charts are now probed per-GPU by the fetcher itself.
netdata_root = None
|
|
@ -5,7 +5,7 @@ from llm_server import opts
|
|||
from llm_server.database import sum_column
|
||||
from llm_server.helpers import deep_sort
|
||||
from llm_server.llm.info import get_running_model
|
||||
from llm_server.netdata import get_power_state
|
||||
from llm_server.netdata import get_power_states
|
||||
from llm_server.routes.cache import redis
|
||||
from llm_server.routes.queue import priority_queue
|
||||
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
|
||||
|
@ -42,11 +42,7 @@ def generate_stats():
|
|||
raise Exception
|
||||
|
||||
if opts.netdata_root:
|
||||
netdata_stats = {
|
||||
'gpu0': {
|
||||
'power_state': int(get_power_state().lower().strip('p'))
|
||||
}
|
||||
}
|
||||
netdata_stats = get_power_states()
|
||||
else:
|
||||
netdata_stats = {}
|
||||
|
||||
|
|
|
@ -53,7 +53,6 @@ opts.show_uptime = config['show_uptime']
|
|||
opts.backend_url = config['backend_url'].strip('/')
|
||||
opts.show_total_output_tokens = config['show_total_output_tokens']
|
||||
opts.netdata_root = config['netdata_root']
|
||||
opts.netdata_metric = config['netdata_metric']
|
||||
|
||||
opts.verify_ssl = config['verify_ssl']
|
||||
if not opts.verify_ssl:
|
||||
|
|
Reference in New Issue