diff --git a/check_nvidia.py b/check_nvidia.py index 2654212..efa37fc 100755 --- a/check_nvidia.py +++ b/check_nvidia.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 import argparse +import nagiosplugin import re import subprocess -import nagiosplugin - """ Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor Not implemented: @@ -33,8 +32,8 @@ class GPUTemp(nagiosplugin.Resource): result = subprocess.run(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'], capture_output=True, text=True, check=True) temp_celsius = float(result.stdout.strip()) - temp_fahrenheit = (temp_celsius * 9 / 5) + 32 - return nagiosplugin.Metric('temperature', temp_fahrenheit, uom='F', context='temperature') + # temp_fahrenheit = (temp_celsius * 9 / 5) + 32 + return nagiosplugin.Metric('temperature', temp_celsius, uom='C', context='temperature') except subprocess.CalledProcessError: raise nagiosplugin.CheckError("Failed to execute nvidia-smi") except ValueError: @@ -162,9 +161,9 @@ class GPUThrottleReasons(nagiosplugin.Resource): active_throttle_reasons = [] for i, reason in enumerate(self.throttle_reasons.keys()): if i < len(throttle_data) and throttle_data[i] == "Active": - active_throttle_reasons.append(f"{reason}: {self.explanations[reason]}") + active_throttle_reasons.append(f"{reason} ({self.explanations[reason]})") - return nagiosplugin.Metric('throttle_reasons', '\n'.join(active_throttle_reasons) if active_throttle_reasons else 'None', context='throttle_reasons') + return nagiosplugin.Metric('throttle_reasons', '; '.join(active_throttle_reasons) if active_throttle_reasons else 'None', context='throttle_reasons') except subprocess.CalledProcessError: raise nagiosplugin.CheckError("Failed to execute nvidia-smi") except KeyError: @@ -188,15 +187,15 @@ class GPUPowerState(nagiosplugin.Resource): class GPUSummary(nagiosplugin.Summary): def ok(self, results): - return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format( + return "{}, Memory Utilization: {}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {}°C, Power Usage: {} W, PCIe Link: Gen{} x{}".format( results['gpu_name'].metric.value, - results['memory_util'].metric.value, + int(results['memory_util'].metric.value), results['used_memory'].metric.value, results['total_memory'].metric.value, results['power_state'].metric.value, - results['temperature'].metric.value, - results['fan_speed'].metric.value, - results['power_usage'].metric.value, + int(results['temperature'].metric.value), + # results['fan_speed'].metric.value, # Fan Speed: {:.1f}% + int(results['power_usage'].metric.value), results['pcie_link_gen'].metric.value, results['pcie_link_width'].metric.value ) @@ -214,10 +213,10 @@ class GPUSummary(nagiosplugin.Summary): elif results['memory_util'].state == nagiosplugin.state.Warn: problem_parts.append("Memory utilization is high") - if results['fan_speed'].state == nagiosplugin.state.Critical: - problem_parts.append("Fan speed is critically low") - elif results['fan_speed'].state == nagiosplugin.state.Warn: - problem_parts.append("Fan speed is low") + # if results['fan_speed'].state == nagiosplugin.state.Critical: + # problem_parts.append("Fan speed is critically low") + # elif results['fan_speed'].state == nagiosplugin.state.Warn: + # problem_parts.append("Fan speed is low") if results['power_usage'].state == nagiosplugin.state.Critical: problem_parts.append("Power usage is critically high") @@ -245,18 +244,18 @@ def main(): argp = argparse.ArgumentParser(description="Check NVIDIA GPU temperature and memory utilization") argp.add_argument('-i', '--gpu-index', metavar='INDEX', default='0', help='index of the GPU to check (default: 0)') - argp.add_argument('-w', '--warning', metavar='RANGE', default='0:175', - help='warning threshold temperature in Fahrenheit') - argp.add_argument('-c', '--critical', metavar='RANGE', default='0:194', - help='critical threshold temperature in Fahrenheit') + argp.add_argument('-w', '--warning', metavar='RANGE', default='0:80', + help='warning threshold temperature in Celsius') + argp.add_argument('-c', '--critical', metavar='RANGE', default='0:90', + help='critical threshold temperature in Celsius') argp.add_argument('-mw', '--memory-warning', metavar='RANGE', default='0:95', help='warning threshold memory utilization in percentage') argp.add_argument('-mc', '--memory-critical', metavar='RANGE', default='0:99', help='critical threshold memory utilization in percentage') - argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80', - help='warning threshold fan speed in percentage') - argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95', - help='critical threshold fan speed in percentage') + # argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80', + # help='warning threshold fan speed in percentage') + # argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95', + # help='critical threshold fan speed in percentage') argp.add_argument('-pw', '--power-warning', metavar='RANGE', default='0:150', help='warning threshold power usage in watts') argp.add_argument('-pc', '--power-critical', metavar='RANGE', default='0:200', @@ -278,8 +277,8 @@ def main(): nagiosplugin.ScalarContext('total_memory'), GPUName(args.gpu_index), nagiosplugin.Context('gpu_name'), - GPUFanSpeed(args.gpu_index), - nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical), + # GPUFanSpeed(args.gpu_index), + # nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical), GPUPowerUsage(args.gpu_index), nagiosplugin.ScalarContext('power_usage', args.power_warning, args.power_critical), GPUPCIeLink(args.gpu_index),