check_nvidia: add power state
This commit is contained in:
parent
5665bd21b0
commit
8ea97d8227
|
@ -1,5 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
import argparse
|
||||||
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import nagiosplugin
|
import nagiosplugin
|
||||||
|
@ -170,13 +171,29 @@ class GPUThrottleReasons(nagiosplugin.Resource):
|
||||||
raise nagiosplugin.CheckError("Unknown throttle reason")
|
raise nagiosplugin.CheckError("Unknown throttle reason")
|
||||||
|
|
||||||
|
|
||||||
|
class GPUPowerState(nagiosplugin.Resource):
    """Resource that reports a GPU's power state as an integer metric.

    The state is read from ``nvidia-smi --query-gpu=pstate``. The output is
    expected to start with ``P`` followed by a decimal number; that number
    is exposed as the ``power_state`` metric.
    """

    def __init__(self, gpu_index):
        """Remember which GPU to query.

        Args:
            gpu_index: Value passed to nvidia-smi's ``--id`` option.
        """
        self.gpu_index = gpu_index

    def probe(self):
        """Query nvidia-smi and return the power state metric.

        Returns:
            A ``nagiosplugin.Metric`` named ``power_state`` (context
            ``power_state``) whose value is the integer following ``P``.

        Raises:
            nagiosplugin.CheckError: If nvidia-smi cannot be started, exits
                with a non-zero status, or prints output that does not
                start with ``P<digits>``.
        """
        command = [
            'nvidia-smi',
            '--query-gpu=pstate',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_index}',
        ]
        # Only the subprocess call lives in the try block. OSError covers a
        # missing or non-executable nvidia-smi binary, which check=True does
        # not report as CalledProcessError.
        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
        except (OSError, subprocess.CalledProcessError) as err:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi") from err

        power_state_str = result.stdout.strip()
        # Require at least one digit so that neither a non-matching string
        # (match is None) nor a bare "P" (empty capture) escapes as an
        # AttributeError/ValueError instead of a CheckError.
        match = re.match(r'P([0-9]+)', power_state_str)
        if match is None:
            raise nagiosplugin.CheckError(
                f"Unexpected power state output from nvidia-smi: {power_state_str!r}"
            )
        return nagiosplugin.Metric('power_state', int(match.group(1)), context='power_state')
|
||||||
|
|
||||||
|
|
||||||
class GPUSummary(nagiosplugin.Summary):
|
class GPUSummary(nagiosplugin.Summary):
|
||||||
def ok(self, results):
|
def ok(self, results):
|
||||||
return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
|
return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
|
||||||
results['gpu_name'].metric.value,
|
results['gpu_name'].metric.value,
|
||||||
results['memory_util'].metric.value,
|
results['memory_util'].metric.value,
|
||||||
results['used_memory'].metric.value,
|
results['used_memory'].metric.value,
|
||||||
results['total_memory'].metric.value,
|
results['total_memory'].metric.value,
|
||||||
|
results['power_state'].metric.value,
|
||||||
results['temperature'].metric.value,
|
results['temperature'].metric.value,
|
||||||
results['fan_speed'].metric.value,
|
results['fan_speed'].metric.value,
|
||||||
results['power_usage'].metric.value,
|
results['power_usage'].metric.value,
|
||||||
|
@ -270,6 +287,8 @@ def main():
|
||||||
nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{args.pcie_link_width_critical}'),
|
nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{args.pcie_link_width_critical}'),
|
||||||
GPUThrottleReasons(args.gpu_index),
|
GPUThrottleReasons(args.gpu_index),
|
||||||
nagiosplugin.Context('throttle_reasons'),
|
nagiosplugin.Context('throttle_reasons'),
|
||||||
|
GPUPowerState(args.gpu_index),
|
||||||
|
nagiosplugin.ScalarContext('power_state'),
|
||||||
GPUSummary()
|
GPUSummary()
|
||||||
)
|
)
|
||||||
check.main()
|
check.main()
|
||||||
|
|
Loading…
Reference in New Issue