reorganize nvidia stats

This commit is contained in:
Cyberes 2023-08-25 15:02:40 -06:00
parent c6edeb2b70
commit d64152587c
3 changed files with 34 additions and 4 deletions

View File

@ -1,4 +1,5 @@
import json import json
from datetime import datetime, timedelta
import requests import requests
@ -26,4 +27,26 @@ def get_power_states():
print('Failed to fetch Netdata metrics:', e) print('Failed to fetch Netdata metrics:', e)
return output return output
gpu_num += 1 gpu_num += 1
return {'power_states': output} return output
def get_gpu_wh(gpu_id: int) -> float:
    """Return the energy consumed by one GPU over the last hour, in kWh.

    Queries the Netdata power chart for the given GPU at one-second
    resolution and integrates the wattage samples over the hour.

    :param gpu_id: GPU index as exposed by nvidia_smi (e.g. 0 -> chart
        ``nvidia_smi.gpu0_power``).
    :return: energy used during the past hour, rounded to 3 decimals (kWh).
    :raises requests.RequestException: if Netdata is unreachable or times out.
    """
    chart_name = f"nvidia_smi.gpu{gpu_id}_power"
    now = datetime.now()
    one_hour_ago = now - timedelta(hours=1)
    # Always 3600, but derived explicitly so the window and the point count
    # can never drift apart if the window size changes.
    num_seconds = int((now - one_hour_ago).total_seconds())
    params = {
        "chart": chart_name,
        "after": int(one_hour_ago.timestamp()),
        "before": int(now.timestamp()),
        "points": num_seconds,
        "group": "second",
        "format": "json",
        "options": "absolute|jsonwrap"
    }
    # Bounded timeout: without it a hung Netdata instance stalls whatever
    # stats endpoint calls this forever.
    response = requests.get(f'{opts.netdata_root}/api/v1/data', params=params, timeout=10)
    data = response.json()
    # Each point is [timestamp, value]; Netdata fills gaps with null (None),
    # which a bare sum() would crash on — skip missing samples instead.
    total_power_usage_watts = sum(
        point[1] for point in data['result']['data'] if point[1] is not None
    )
    # Samples are 1 second apart, so the sum is watt-seconds (joules):
    # /3600 -> watt-hours, /1000 -> kilowatt-hours.
    total_power_usage_kwh = round(total_power_usage_watts / 1000 / 3600, 3)
    return total_power_usage_kwh

View File

@ -18,4 +18,4 @@ show_num_prompts = True
show_uptime = True show_uptime = True
average_generation_time_mode = 'database' average_generation_time_mode = 'database'
show_total_output_tokens = True show_total_output_tokens = True
netdata_root = None netdata_root = None

View File

@ -5,7 +5,7 @@ from llm_server import opts
from llm_server.database import get_distinct_ips_24h, sum_column from llm_server.database import get_distinct_ips_24h, sum_column
from llm_server.helpers import deep_sort from llm_server.helpers import deep_sort
from llm_server.llm.info import get_running_model from llm_server.llm.info import get_running_model
from llm_server.netdata import get_power_states from llm_server.netdata import get_gpu_wh, get_power_states
from llm_server.routes.cache import redis from llm_server.routes.cache import redis
from llm_server.routes.queue import priority_queue from llm_server.routes.queue import priority_queue
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
@ -42,7 +42,14 @@ def generate_stats():
raise Exception raise Exception
if opts.netdata_root: if opts.netdata_root:
netdata_stats = get_power_states() netdata_stats = {}
power_states = get_power_states()
for gpu, power_state in power_states.items():
netdata_stats[gpu] = {
'power_state': power_state,
# 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
}
else: else:
netdata_stats = {} netdata_stats = {}