check_nvidia: add power state
This commit is contained in:
parent
5665bd21b0
commit
8ea97d8227
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import nagiosplugin
|
||||
|
@ -170,13 +171,29 @@ class GPUThrottleReasons(nagiosplugin.Resource):
|
|||
raise nagiosplugin.CheckError("Unknown throttle reason")
|
||||
|
||||
|
||||
class GPUPowerState(nagiosplugin.Resource):
    """Resource that reports the performance state (P-state) of one GPU.

    The state is read from ``nvidia-smi --query-gpu=pstate`` and exposed as
    the integer metric ``power_state`` (e.g. the output ``P8`` becomes ``8``).
    """

    def __init__(self, gpu_index):
        """Remember which GPU to query.

        Args:
            gpu_index: Value passed to nvidia-smi's ``--id`` option.
        """
        self.gpu_index = gpu_index

    def probe(self):
        """Query nvidia-smi and return the numeric power state.

        Returns:
            A ``nagiosplugin.Metric`` named ``power_state`` (context
            ``power_state``) whose value is the integer following the
            leading ``P`` in nvidia-smi's output.

        Raises:
            nagiosplugin.CheckError: If nvidia-smi cannot be started, exits
                with a non-zero status, or prints something that does not
                start with ``P<digits>``.
        """
        command = [
            'nvidia-smi',
            '--query-gpu=pstate',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_index}',
        ]
        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
        except (subprocess.CalledProcessError, OSError) as err:
            # OSError covers a missing or non-executable nvidia-smi binary,
            # which subprocess.run reports before the command ever runs.
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi") from err

        power_state_str = result.stdout.strip()
        # Require at least one digit: a None match or an empty capture group
        # would otherwise surface as AttributeError / ValueError.
        match = re.match(r'P([0-9]+)', power_state_str)
        if match is None:
            raise nagiosplugin.CheckError(
                f"Unexpected power state from nvidia-smi: {power_state_str!r}"
            )
        return nagiosplugin.Metric('power_state', int(match.group(1)), context='power_state')
|
||||
|
||||
|
||||
class GPUSummary(nagiosplugin.Summary):
|
||||
def ok(self, results):
|
||||
return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
|
||||
return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
|
||||
results['gpu_name'].metric.value,
|
||||
results['memory_util'].metric.value,
|
||||
results['used_memory'].metric.value,
|
||||
results['total_memory'].metric.value,
|
||||
results['power_state'].metric.value,
|
||||
results['temperature'].metric.value,
|
||||
results['fan_speed'].metric.value,
|
||||
results['power_usage'].metric.value,
|
||||
|
@ -270,6 +287,8 @@ def main():
|
|||
nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{args.pcie_link_width_critical}'),
|
||||
GPUThrottleReasons(args.gpu_index),
|
||||
nagiosplugin.Context('throttle_reasons'),
|
||||
GPUPowerState(args.gpu_index),
|
||||
nagiosplugin.ScalarContext('power_state'),
|
||||
GPUSummary()
|
||||
)
|
||||
check.main()
|
||||
|
|
Loading…
Reference in New Issue