reorganize nvidia stats
This commit is contained in:
parent
c6edeb2b70
commit
d64152587c
|
@ -1,4 +1,5 @@
|
||||||
import json
|
import json
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
@ -26,4 +27,26 @@ def get_power_states():
|
||||||
print('Failed to fetch Netdata metrics:', e)
|
print('Failed to fetch Netdata metrics:', e)
|
||||||
return output
|
return output
|
||||||
gpu_num += 1
|
gpu_num += 1
|
||||||
return {'power_states': output}
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def get_gpu_wh(gpu_id: int):
    """Return the energy drawn by GPU ``gpu_id`` over the last hour, in kWh.

    Queries the Netdata ``/api/v1/data`` endpoint for per-second power
    samples (watts) from the ``nvidia_smi.gpu<N>_power`` chart and
    integrates them over the window.

    :param gpu_id: numeric index of the GPU, as used in the Netdata chart name.
    :return: total energy in kilowatt-hours rounded to 3 decimals, or
             ``None`` if the metrics could not be fetched or parsed.
    """
    chart_name = f"nvidia_smi.gpu{gpu_id}_power"
    now = datetime.now()
    one_hour_ago = now - timedelta(hours=1)
    # One data point per second of the query window (3600 for one hour).
    num_seconds = int((now - one_hour_ago).total_seconds())
    params = {
        "chart": chart_name,
        "after": int(one_hour_ago.timestamp()),
        "before": int(now.timestamp()),
        "points": num_seconds,
        "group": "second",
        "format": "json",
        "options": "absolute|jsonwrap"
    }
    try:
        # Timeout so a stalled Netdata instance can't hang the stats page.
        response = requests.get(f'{opts.netdata_root}/api/v1/data', params=params, timeout=10)
        response.raise_for_status()
        data = json.loads(response.text)
        # Netdata may emit None for samples it is missing — skip those gaps
        # instead of crashing the sum with a TypeError.
        total_power_usage_watts = sum(
            point[1] for point in data['result']['data'] if point[1] is not None
        )
    except Exception as e:
        # Mirror the best-effort error handling used by get_power_states().
        print('Failed to fetch Netdata metrics:', e)
        return None
    # total_power_usage_watt_hours = round(total_power_usage_watts / 3600, 1)
    # Sum of 1 W-samples taken once per second == watt-seconds;
    # divide by 3600 (s/h) and 1000 (W/kW) to get kWh.
    total_power_usage_kwh = round(total_power_usage_watts / 1000 / 3600, 3)
    return total_power_usage_kwh
|
||||||
|
|
|
@ -18,4 +18,4 @@ show_num_prompts = True
|
||||||
show_uptime = True
|
show_uptime = True
|
||||||
average_generation_time_mode = 'database'
|
average_generation_time_mode = 'database'
|
||||||
show_total_output_tokens = True
|
show_total_output_tokens = True
|
||||||
netdata_root = None
|
netdata_root = None
|
||||||
|
|
|
@ -5,7 +5,7 @@ from llm_server import opts
|
||||||
from llm_server.database import get_distinct_ips_24h, sum_column
|
from llm_server.database import get_distinct_ips_24h, sum_column
|
||||||
from llm_server.helpers import deep_sort
|
from llm_server.helpers import deep_sort
|
||||||
from llm_server.llm.info import get_running_model
|
from llm_server.llm.info import get_running_model
|
||||||
from llm_server.netdata import get_power_states
|
from llm_server.netdata import get_gpu_wh, get_power_states
|
||||||
from llm_server.routes.cache import redis
|
from llm_server.routes.cache import redis
|
||||||
from llm_server.routes.queue import priority_queue
|
from llm_server.routes.queue import priority_queue
|
||||||
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
|
from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time
|
||||||
|
@ -42,7 +42,14 @@ def generate_stats():
|
||||||
raise Exception
|
raise Exception
|
||||||
|
|
||||||
if opts.netdata_root:
|
if opts.netdata_root:
|
||||||
netdata_stats = get_power_states()
|
netdata_stats = {}
|
||||||
|
power_states = get_power_states()
|
||||||
|
for gpu, power_state in power_states.items():
|
||||||
|
netdata_stats[gpu] = {
|
||||||
|
'power_state': power_state,
|
||||||
|
# 'wh_wasted_1_hr': get_gpu_wh(int(gpu.strip('gpu')))
|
||||||
|
}
|
||||||
|
|
||||||
else:
|
else:
|
||||||
netdata_stats = {}
|
netdata_stats = {}
|
||||||
|
|
||||||
|
|
Reference in New Issue