dynamically fetch GPUs for netdata

This commit is contained in:
Cyberes 2023-08-24 21:56:15 -06:00
parent 16b986c206
commit 0230ddda17
4 changed files with 25 additions and 23 deletions

View File

@@ -5,18 +5,26 @@ import requests
from llm_server import opts from llm_server import opts
def get_power_states():
    """Enumerate GPUs via Netdata and return each one's NVIDIA power state.

    Probes charts named ``nvidia_smi.gpu{N}_power_state`` for N = 0, 1, ...
    until Netdata answers with a non-200 status (no such chart -> all GPUs
    enumerated).

    Returns:
        dict: mapping ``'gpu{N}'`` -> int power-state number (label 'P0' -> 0,
        'P8' -> 8, ...). On a request/parse failure the states collected so
        far are returned (best-effort, matching the original error handling).
    """
    gpu_num = 0
    output = {}
    while True:
        url = f"{opts.netdata_root}/api/v1/data?chart=nvidia_smi.gpu{gpu_num}_power_state"
        try:
            response = requests.get(url, timeout=3)
            if response.status_code != 200:
                # Chart for this GPU index does not exist -> done enumerating.
                break
            data = json.loads(response.text)
            # First data row: column 0 is the timestamp, the remaining columns
            # are one flag per power-state label; the active state is 1.
            power_state_data = data['data'][0]
            power_state = None
            for i in range(1, len(power_state_data)):
                if power_state_data[i] == 1:
                    power_state = data['labels'][i]
                    break
            if power_state is None:
                # No column flagged active in this sample; skip this GPU
                # instead of crashing on power_state.lower() below.
                gpu_num += 1
                continue
            # Labels look like 'P0', 'P8', ... -> keep just the number.
            output[f'gpu{gpu_num}'] = int(power_state.lower().strip('p'))
        except Exception as e:
            # Best-effort: report the failure and return what we have so far.
            print('Failed to fetch Netdata metrics:', e)
            return output
        gpu_num += 1
    return output

View File

@@ -18,5 +18,4 @@ show_num_prompts = True
show_uptime = True show_uptime = True
average_generation_time_mode = 'database' average_generation_time_mode = 'database'
show_total_output_tokens = True show_total_output_tokens = True
netdata_root = None netdata_root = None
netdata_metric = None

View File

@@ -5,7 +5,7 @@ from llm_server import opts
from llm_server.database import sum_column from llm_server.database import sum_column
from llm_server.helpers import deep_sort from llm_server.helpers import deep_sort
from llm_server.llm.info import get_running_model from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_state from llm_server.netdata import get_power_states
from llm_server.routes.cache import redis from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
@@ -42,11 +42,7 @@ def generate_stats():
raise Exception raise Exception
if opts.netdata_root: if opts.netdata_root:
netdata_stats = { netdata_stats = get_power_states()
'gpu0': {
'power_state': int(get_power_state().lower().strip('p'))
}
}
else: else:
netdata_stats = {} netdata_stats = {}

View File

@@ -53,7 +53,6 @@ opts.show_uptime = config['show_uptime']
opts.backend_url = config['backend_url'].strip('/') opts.backend_url = config['backend_url'].strip('/')
opts.show_total_output_tokens = config['show_total_output_tokens'] opts.show_total_output_tokens = config['show_total_output_tokens']
opts.netdata_root = config['netdata_root'] opts.netdata_root = config['netdata_root']
opts.netdata_metric = config['netdata_metric']
opts.verify_ssl = config['verify_ssl'] opts.verify_ssl = config['verify_ssl']
if not opts.verify_ssl: if not opts.verify_ssl: