check_nvidia: remove fan check, adjust output, report temp in C

This commit is contained in:
Cyberes 2024-11-26 21:20:53 -07:00
parent 3abdce195f
commit 68c923914e
1 changed file with 10 additions and 11 deletions

View File

@ -1,10 +1,9 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import nagiosplugin
import re import re
import subprocess import subprocess
import nagiosplugin
""" """
Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor
Not implemented: Not implemented:
@ -33,8 +32,8 @@ class GPUTemp(nagiosplugin.Resource):
result = subprocess.run(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'], result = subprocess.run(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'],
capture_output=True, text=True, check=True) capture_output=True, text=True, check=True)
temp_celsius = float(result.stdout.strip()) temp_celsius = float(result.stdout.strip())
temp_fahrenheit = (temp_celsius * 9 / 5) + 32 # temp_fahrenheit = (temp_celsius * 9 / 5) + 32
return nagiosplugin.Metric('temperature', temp_fahrenheit, uom='F', context='temperature') return nagiosplugin.Metric('temperature', temp_celsius, uom='C', context='temperature')
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
raise nagiosplugin.CheckError("Failed to execute nvidia-smi") raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
except ValueError: except ValueError:
@ -162,7 +161,7 @@ class GPUThrottleReasons(nagiosplugin.Resource):
active_throttle_reasons = [] active_throttle_reasons = []
for i, reason in enumerate(self.throttle_reasons.keys()): for i, reason in enumerate(self.throttle_reasons.keys()):
if i < len(throttle_data) and throttle_data[i] == "Active": if i < len(throttle_data) and throttle_data[i] == "Active":
active_throttle_reasons.append(f"{reason}: {self.explanations[reason]}") active_throttle_reasons.append(f"{reason} ({self.explanations[reason]})")
return nagiosplugin.Metric('throttle_reasons', '\n'.join(active_throttle_reasons) if active_throttle_reasons else 'None', context='throttle_reasons') return nagiosplugin.Metric('throttle_reasons', '\n'.join(active_throttle_reasons) if active_throttle_reasons else 'None', context='throttle_reasons')
except subprocess.CalledProcessError: except subprocess.CalledProcessError:
@ -253,10 +252,10 @@ def main():
help='warning threshold memory utilization in percentage') help='warning threshold memory utilization in percentage')
argp.add_argument('-mc', '--memory-critical', metavar='RANGE', default='0:99', argp.add_argument('-mc', '--memory-critical', metavar='RANGE', default='0:99',
help='critical threshold memory utilization in percentage') help='critical threshold memory utilization in percentage')
argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80', # argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80',
help='warning threshold fan speed in percentage') # help='warning threshold fan speed in percentage')
argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95', # argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95',
help='critical threshold fan speed in percentage') # help='critical threshold fan speed in percentage')
argp.add_argument('-pw', '--power-warning', metavar='RANGE', default='0:150', argp.add_argument('-pw', '--power-warning', metavar='RANGE', default='0:150',
help='warning threshold power usage in watts') help='warning threshold power usage in watts')
argp.add_argument('-pc', '--power-critical', metavar='RANGE', default='0:200', argp.add_argument('-pc', '--power-critical', metavar='RANGE', default='0:200',
@ -278,8 +277,8 @@ def main():
nagiosplugin.ScalarContext('total_memory'), nagiosplugin.ScalarContext('total_memory'),
GPUName(args.gpu_index), GPUName(args.gpu_index),
nagiosplugin.Context('gpu_name'), nagiosplugin.Context('gpu_name'),
GPUFanSpeed(args.gpu_index), # GPUFanSpeed(args.gpu_index),
nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical), # nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical),
GPUPowerUsage(args.gpu_index), GPUPowerUsage(args.gpu_index),
nagiosplugin.ScalarContext('power_usage', args.power_warning, args.power_critical), nagiosplugin.ScalarContext('power_usage', args.power_warning, args.power_critical),
GPUPCIeLink(args.gpu_index), GPUPCIeLink(args.gpu_index),