check_nvidia: remove fan check, adjust output, report temp in C
This commit is contained in:
parent
3abdce195f
commit
8f6d164aea
|
@ -1,10 +1,9 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import argparse
|
import argparse
|
||||||
|
import nagiosplugin
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
import nagiosplugin
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor
|
Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor
|
||||||
Not implemented:
|
Not implemented:
|
||||||
|
@ -33,8 +32,8 @@ class GPUTemp(nagiosplugin.Resource):
|
||||||
result = subprocess.run(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'],
|
result = subprocess.run(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'],
|
||||||
capture_output=True, text=True, check=True)
|
capture_output=True, text=True, check=True)
|
||||||
temp_celsius = float(result.stdout.strip())
|
temp_celsius = float(result.stdout.strip())
|
||||||
temp_fahrenheit = (temp_celsius * 9 / 5) + 32
|
# temp_fahrenheit = (temp_celsius * 9 / 5) + 32
|
||||||
return nagiosplugin.Metric('temperature', temp_fahrenheit, uom='F', context='temperature')
|
return nagiosplugin.Metric('temperature', temp_celsius, uom='C', context='temperature')
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
|
raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
@ -162,9 +161,9 @@ class GPUThrottleReasons(nagiosplugin.Resource):
|
||||||
active_throttle_reasons = []
|
active_throttle_reasons = []
|
||||||
for i, reason in enumerate(self.throttle_reasons.keys()):
|
for i, reason in enumerate(self.throttle_reasons.keys()):
|
||||||
if i < len(throttle_data) and throttle_data[i] == "Active":
|
if i < len(throttle_data) and throttle_data[i] == "Active":
|
||||||
active_throttle_reasons.append(f"{reason}: {self.explanations[reason]}")
|
active_throttle_reasons.append(f"{reason} ({self.explanations[reason]})")
|
||||||
|
|
||||||
return nagiosplugin.Metric('throttle_reasons', '\n'.join(active_throttle_reasons) if active_throttle_reasons else 'None', context='throttle_reasons')
|
return nagiosplugin.Metric('throttle_reasons', '; '.join(active_throttle_reasons) if active_throttle_reasons else 'None', context='throttle_reasons')
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
|
raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -188,14 +187,14 @@ class GPUPowerState(nagiosplugin.Resource):
|
||||||
|
|
||||||
class GPUSummary(nagiosplugin.Summary):
|
class GPUSummary(nagiosplugin.Summary):
|
||||||
def ok(self, results):
|
def ok(self, results):
|
||||||
return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
|
return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {:.1f}°F, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
|
||||||
results['gpu_name'].metric.value,
|
results['gpu_name'].metric.value,
|
||||||
results['memory_util'].metric.value,
|
results['memory_util'].metric.value,
|
||||||
results['used_memory'].metric.value,
|
results['used_memory'].metric.value,
|
||||||
results['total_memory'].metric.value,
|
results['total_memory'].metric.value,
|
||||||
results['power_state'].metric.value,
|
results['power_state'].metric.value,
|
||||||
results['temperature'].metric.value,
|
results['temperature'].metric.value,
|
||||||
results['fan_speed'].metric.value,
|
# results['fan_speed'].metric.value, # Fan Speed: {:.1f}%
|
||||||
results['power_usage'].metric.value,
|
results['power_usage'].metric.value,
|
||||||
results['pcie_link_gen'].metric.value,
|
results['pcie_link_gen'].metric.value,
|
||||||
results['pcie_link_width'].metric.value
|
results['pcie_link_width'].metric.value
|
||||||
|
@ -214,10 +213,10 @@ class GPUSummary(nagiosplugin.Summary):
|
||||||
elif results['memory_util'].state == nagiosplugin.state.Warn:
|
elif results['memory_util'].state == nagiosplugin.state.Warn:
|
||||||
problem_parts.append("Memory utilization is high")
|
problem_parts.append("Memory utilization is high")
|
||||||
|
|
||||||
if results['fan_speed'].state == nagiosplugin.state.Critical:
|
# if results['fan_speed'].state == nagiosplugin.state.Critical:
|
||||||
problem_parts.append("Fan speed is critically low")
|
# problem_parts.append("Fan speed is critically low")
|
||||||
elif results['fan_speed'].state == nagiosplugin.state.Warn:
|
# elif results['fan_speed'].state == nagiosplugin.state.Warn:
|
||||||
problem_parts.append("Fan speed is low")
|
# problem_parts.append("Fan speed is low")
|
||||||
|
|
||||||
if results['power_usage'].state == nagiosplugin.state.Critical:
|
if results['power_usage'].state == nagiosplugin.state.Critical:
|
||||||
problem_parts.append("Power usage is critically high")
|
problem_parts.append("Power usage is critically high")
|
||||||
|
@ -245,18 +244,18 @@ def main():
|
||||||
argp = argparse.ArgumentParser(description="Check NVIDIA GPU temperature and memory utilization")
|
argp = argparse.ArgumentParser(description="Check NVIDIA GPU temperature and memory utilization")
|
||||||
argp.add_argument('-i', '--gpu-index', metavar='INDEX', default='0',
|
argp.add_argument('-i', '--gpu-index', metavar='INDEX', default='0',
|
||||||
help='index of the GPU to check (default: 0)')
|
help='index of the GPU to check (default: 0)')
|
||||||
argp.add_argument('-w', '--warning', metavar='RANGE', default='0:175',
|
argp.add_argument('-w', '--warning', metavar='RANGE', default='0:80',
|
||||||
help='warning threshold temperature in Fahrenheit')
|
help='warning threshold temperature in Celsius')
|
||||||
argp.add_argument('-c', '--critical', metavar='RANGE', default='0:194',
|
argp.add_argument('-c', '--critical', metavar='RANGE', default='0:90',
|
||||||
help='critical threshold temperature in Fahrenheit')
|
help='critical threshold temperature in Celsius')
|
||||||
argp.add_argument('-mw', '--memory-warning', metavar='RANGE', default='0:95',
|
argp.add_argument('-mw', '--memory-warning', metavar='RANGE', default='0:95',
|
||||||
help='warning threshold memory utilization in percentage')
|
help='warning threshold memory utilization in percentage')
|
||||||
argp.add_argument('-mc', '--memory-critical', metavar='RANGE', default='0:99',
|
argp.add_argument('-mc', '--memory-critical', metavar='RANGE', default='0:99',
|
||||||
help='critical threshold memory utilization in percentage')
|
help='critical threshold memory utilization in percentage')
|
||||||
argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80',
|
# argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80',
|
||||||
help='warning threshold fan speed in percentage')
|
# help='warning threshold fan speed in percentage')
|
||||||
argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95',
|
# argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95',
|
||||||
help='critical threshold fan speed in percentage')
|
# help='critical threshold fan speed in percentage')
|
||||||
argp.add_argument('-pw', '--power-warning', metavar='RANGE', default='0:150',
|
argp.add_argument('-pw', '--power-warning', metavar='RANGE', default='0:150',
|
||||||
help='warning threshold power usage in watts')
|
help='warning threshold power usage in watts')
|
||||||
argp.add_argument('-pc', '--power-critical', metavar='RANGE', default='0:200',
|
argp.add_argument('-pc', '--power-critical', metavar='RANGE', default='0:200',
|
||||||
|
@ -278,8 +277,8 @@ def main():
|
||||||
nagiosplugin.ScalarContext('total_memory'),
|
nagiosplugin.ScalarContext('total_memory'),
|
||||||
GPUName(args.gpu_index),
|
GPUName(args.gpu_index),
|
||||||
nagiosplugin.Context('gpu_name'),
|
nagiosplugin.Context('gpu_name'),
|
||||||
GPUFanSpeed(args.gpu_index),
|
# GPUFanSpeed(args.gpu_index),
|
||||||
nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical),
|
# nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical),
|
||||||
GPUPowerUsage(args.gpu_index),
|
GPUPowerUsage(args.gpu_index),
|
||||||
nagiosplugin.ScalarContext('power_usage', args.power_warning, args.power_critical),
|
nagiosplugin.ScalarContext('power_usage', args.power_warning, args.power_critical),
|
||||||
GPUPCIeLink(args.gpu_index),
|
GPUPCIeLink(args.gpu_index),
|
||||||
|
|
Loading…
Reference in New Issue