From 0230ddda176d1b2c3cf80272de15ee5266e24a35 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Thu, 24 Aug 2023 21:56:15 -0600 Subject: [PATCH] dynamically fetch GPUs for netdata --- llm_server/netdata.py | 36 ++++++++++++++++---------- llm_server/opts.py | 3 +-- llm_server/routes/v1/generate_stats.py | 8 ++---- server.py | 1 - 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/llm_server/netdata.py b/llm_server/netdata.py index 1efe013..36ce084 100644 --- a/llm_server/netdata.py +++ b/llm_server/netdata.py @@ -5,18 +5,26 @@ import requests from llm_server import opts -def get_power_state(): - url = f"{opts.netdata_root}/api/v1/data?chart={opts.netdata_metric}" - try: - response = requests.get(url, timeout=3) - data = json.loads(response.text) - power_state_data = data['data'][0] - power_state = None - for i in range(1, len(power_state_data)): - if power_state_data[i] == 1: - power_state = data['labels'][i] +def get_power_states(): + gpu_num = 0 + output = {} + while True: + url = f"{opts.netdata_root}/api/v1/data?chart=nvidia_smi.gpu{gpu_num}_power_state" + try: + response = requests.get(url, timeout=3) + if response.status_code != 200: break - return power_state - except Exception as e: - print('Failed to fetch Netdata metrics:', e) - return None + data = json.loads(response.text) + power_state_data = data['data'][0] + power_state = None + for i in range(1, len(power_state_data)): + if power_state_data[i] == 1: + power_state = data['labels'][i] + break + output[f'gpu{gpu_num}'] = int(power_state.lower().strip('p')) + print(output) + except Exception as e: + print('Failed to fetch Netdata metrics:', e) + return output + gpu_num += 1 + return output diff --git a/llm_server/opts.py b/llm_server/opts.py index bda9b8b..476545b 100644 --- a/llm_server/opts.py +++ b/llm_server/opts.py @@ -18,5 +18,4 @@ show_num_prompts = True show_uptime = True average_generation_time_mode = 'database' show_total_output_tokens = True -netdata_root = None -netdata_metric = None +netdata_root = None \ No newline at end of file diff --git a/llm_server/routes/v1/generate_stats.py b/llm_server/routes/v1/generate_stats.py index ca47f07..95e5bba 100644 --- a/llm_server/routes/v1/generate_stats.py +++ b/llm_server/routes/v1/generate_stats.py @@ -5,7 +5,7 @@ from llm_server import opts from llm_server.database import sum_column from llm_server.helpers import deep_sort from llm_server.llm.info import get_running_model -from llm_server.netdata import get_power_state +from llm_server.netdata import get_power_states from llm_server.routes.cache import redis from llm_server.routes.queue import priority_queue from llm_server.routes.stats import SemaphoreCheckerThread, calculate_avg_gen_time, get_active_gen_workers, get_total_proompts, server_start_time @@ -42,11 +42,7 @@ def generate_stats(): raise Exception if opts.netdata_root: - netdata_stats = { - 'gpu0': { - 'power_state': int(get_power_state().lower().strip('p')) - } - } + netdata_stats = get_power_states() else: netdata_stats = {} diff --git a/server.py b/server.py index 6e77341..0855a2a 100644 --- a/server.py +++ b/server.py @@ -53,7 +53,6 @@ opts.show_uptime = config['show_uptime'] opts.backend_url = config['backend_url'].strip('/') opts.show_total_output_tokens = config['show_total_output_tokens'] opts.netdata_root = config['netdata_root'] -opts.netdata_metric = config['netdata_metric'] opts.verify_ssl = config['verify_ssl'] if not opts.verify_ssl: