check_nvidia: add power state

This commit is contained in:
Cyberes 2024-11-08 22:03:44 -07:00
parent 5665bd21b0
commit 8ea97d8227
1 changed files with 20 additions and 1 deletions

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import re
import subprocess import subprocess
import nagiosplugin import nagiosplugin
@ -170,13 +171,29 @@ class GPUThrottleReasons(nagiosplugin.Resource):
raise nagiosplugin.CheckError("Unknown throttle reason") raise nagiosplugin.CheckError("Unknown throttle reason")
class GPUPowerState(nagiosplugin.Resource):
    """Nagios resource that reports the ``pstate`` value nvidia-smi returns for one GPU.

    The value is exposed as an integer metric named ``power_state`` in the
    ``power_state`` context.
    """

    def __init__(self, gpu_index):
        """Remember which GPU to query.

        Args:
            gpu_index: Value passed to nvidia-smi via ``--id=``.
        """
        self.gpu_index = gpu_index

    def probe(self):
        """Query nvidia-smi for the GPU's pstate and return it as a metric.

        Returns:
            A ``nagiosplugin.Metric`` named ``power_state`` whose value is the
            integer that follows the leading ``P`` in nvidia-smi's output.

        Raises:
            nagiosplugin.CheckError: If nvidia-smi exits with a non-zero
                status, or if its output is not ``P`` followed by digits.
        """
        # Keep the try body limited to the call that can raise CalledProcessError.
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=pstate', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'],
                capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")

        power_state_str = result.stdout.strip()
        # Require at least one digit: with `*` a bare "P" would match and
        # int('') would raise ValueError; with no match at all, .group() would
        # raise AttributeError on None. Report both as a check error instead.
        match = re.match(r'P([0-9]+)', power_state_str)
        if match is None:
            raise nagiosplugin.CheckError(
                f"Unexpected power state from nvidia-smi: {power_state_str!r}")
        return nagiosplugin.Metric('power_state', int(match.group(1)), context='power_state')
class GPUSummary(nagiosplugin.Summary): class GPUSummary(nagiosplugin.Summary):
def ok(self, results): def ok(self, results):
return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format( return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
results['gpu_name'].metric.value, results['gpu_name'].metric.value,
results['memory_util'].metric.value, results['memory_util'].metric.value,
results['used_memory'].metric.value, results['used_memory'].metric.value,
results['total_memory'].metric.value, results['total_memory'].metric.value,
results['power_state'].metric.value,
results['temperature'].metric.value, results['temperature'].metric.value,
results['fan_speed'].metric.value, results['fan_speed'].metric.value,
results['power_usage'].metric.value, results['power_usage'].metric.value,
@ -270,6 +287,8 @@ def main():
nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{args.pcie_link_width_critical}'), nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{args.pcie_link_width_critical}'),
GPUThrottleReasons(args.gpu_index), GPUThrottleReasons(args.gpu_index),
nagiosplugin.Context('throttle_reasons'), nagiosplugin.Context('throttle_reasons'),
GPUPowerState(args.gpu_index),
nagiosplugin.ScalarContext('power_state'),
GPUSummary() GPUSummary()
) )
check.main() check.main()