NOTE(review): this patch was reflowed from a single-line paste. Indentation of
the unchanged context lines is reconstructed and must be confirmed against
check_nvidia.py (or apply with `git apply --ignore-whitespace`). The added
GPUPowerState code differs from the post-image blob named on the index line.

diff --git a/check_nvidia.py b/check_nvidia.py
index 80a8ddb..2654212 100755
--- a/check_nvidia.py
+++ b/check_nvidia.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 import argparse
+import re
 import subprocess
 
 import nagiosplugin
@@ -170,13 +171,42 @@ class GPUThrottleReasons(nagiosplugin.Resource):
             raise nagiosplugin.CheckError("Unknown throttle reason")
 
 
+class GPUPowerState(nagiosplugin.Resource):
+    """Report the GPU performance state (P-state) as an integer metric."""
+
+    def __init__(self, gpu_index):
+        self.gpu_index = gpu_index
+
+    def probe(self):
+        """Query nvidia-smi for the P-state of the selected GPU.
+
+        Raises nagiosplugin.CheckError if nvidia-smi fails or its output
+        does not start with "P<digits>".
+        """
+        try:
+            result = subprocess.run(
+                ['nvidia-smi', '--query-gpu=pstate', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'],
+                capture_output=True, text=True, check=True)
+        except subprocess.CalledProcessError:
+            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
+        power_state_str = result.stdout.strip()
+        # Require at least one digit and check the match explicitly: otherwise
+        # output not starting with "P<digits>" fails with AttributeError
+        # (no match) or ValueError (int('')) instead of a CheckError.
+        match = re.match(r'P([0-9]+)', power_state_str)
+        if match is None:
+            raise nagiosplugin.CheckError(f"Unexpected power state: {power_state_str!r}")
+        return nagiosplugin.Metric('power_state', int(match.group(1)), context='power_state')
+
+
 class GPUSummary(nagiosplugin.Summary):
     def ok(self, results):
-        return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
+        return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
             results['gpu_name'].metric.value,
             results['memory_util'].metric.value,
             results['used_memory'].metric.value,
             results['total_memory'].metric.value,
+            results['power_state'].metric.value,
             results['temperature'].metric.value,
             results['fan_speed'].metric.value,
             results['power_usage'].metric.value,
@@ -270,6 +300,8 @@ def main():
         nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{args.pcie_link_width_critical}'),
         GPUThrottleReasons(args.gpu_index),
         nagiosplugin.Context('throttle_reasons'),
+        GPUPowerState(args.gpu_index),
+        nagiosplugin.ScalarContext('power_state'),
         GPUSummary()
     )
     check.main()