track nvidia power states through netdata

This commit is contained in:
Cyberes 2023-08-24 21:36:00 -06:00
parent 01b8442b95
commit 16b986c206
4 changed files with 37 additions and 0 deletions

22
llm_server/netdata.py Normal file
View File

@ -0,0 +1,22 @@
import json
import requests
from llm_server import opts
def get_power_state():
    """Return the GPU power-state label (e.g. 'P0') reported by Netdata.

    Queries the Netdata chart named by ``opts.netdata_metric`` under
    ``opts.netdata_root`` and returns the label of the first dimension whose
    latest sample equals 1, or ``None`` if no dimension is active or the
    request fails for any reason (best-effort: Netdata being unreachable
    must never break the caller).
    """
    url = f"{opts.netdata_root}/api/v1/data?chart={opts.netdata_metric}"
    try:
        response = requests.get(url, timeout=3)
        # Fail fast on HTTP errors (404/500) rather than trying to parse an
        # error page as JSON further down.
        response.raise_for_status()
        data = response.json()
        # The first row of 'data' is the most recent sample; column 0 is the
        # timestamp, the remaining columns correspond to 'labels'.
        latest_sample = data['data'][0]
        for i, value in enumerate(latest_sample[1:], start=1):
            if value == 1:
                return data['labels'][i]
        return None
    except Exception as e:
        # Deliberate broad catch: any failure degrades to "state unknown".
        print('Failed to fetch Netdata metrics:', e)
        return None

View File

@ -18,3 +18,5 @@ show_num_prompts = True
show_uptime = True
average_generation_time_mode = 'database'
show_total_output_tokens = True
netdata_root = None
netdata_metric = None

View File

@ -5,6 +5,7 @@ from llm_server import opts
from llm_server.database import sum_column
from llm_server.helpers import deep_sort
from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_state
from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
@ -40,6 +41,15 @@ def generate_stats():
    else:
        raise Exception
if opts.netdata_root:
netdata_stats = {
'gpu0': {
'power_state': int(get_power_state().lower().strip('p'))
}
}
else:
netdata_stats = {}
    output = {
        'stats': {
            'proompts_in_queue': proompters_in_queue,
@ -49,6 +59,7 @@ def generate_stats():
            'average_generation_elapsed_sec': average_generation_time,
            'average_tps': average_tps,
            'tokens_generated': sum_column('prompts', 'response_tokens') if opts.show_total_output_tokens else None,
'nvidia': netdata_stats
        },
        'online': online,
        'endpoints': {

View File

@ -52,6 +52,8 @@ opts.show_num_prompts = config['show_num_prompts']
opts.show_uptime = config['show_uptime']
opts.backend_url = config['backend_url'].strip('/')
opts.show_total_output_tokens = config['show_total_output_tokens']
opts.netdata_root = config['netdata_root']
opts.netdata_metric = config['netdata_metric']
opts.verify_ssl = config['verify_ssl']
if not opts.verify_ssl: