From 68c923914e752cefac542690b6fed281b5defd75 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Tue, 26 Nov 2024 21:20:53 -0700 Subject: [PATCH] check_nvidia: remove fan check, adjust output, report temp in C --- check_nvidia.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/check_nvidia.py b/check_nvidia.py index 2654212..4bd76da 100755 --- a/check_nvidia.py +++ b/check_nvidia.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 import argparse +import nagiosplugin import re import subprocess -import nagiosplugin - """ Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor Not implemented: @@ -33,8 +32,8 @@ class GPUTemp(nagiosplugin.Resource): result = subprocess.run(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'], capture_output=True, text=True, check=True) temp_celsius = float(result.stdout.strip()) - temp_fahrenheit = (temp_celsius * 9 / 5) + 32 - return nagiosplugin.Metric('temperature', temp_fahrenheit, uom='F', context='temperature') + # temp_fahrenheit = (temp_celsius * 9 / 5) + 32 + return nagiosplugin.Metric('temperature', temp_celsius, uom='C', context='temperature') except subprocess.CalledProcessError: raise nagiosplugin.CheckError("Failed to execute nvidia-smi") except ValueError: @@ -162,7 +161,7 @@ class GPUThrottleReasons(nagiosplugin.Resource): active_throttle_reasons = [] for i, reason in enumerate(self.throttle_reasons.keys()): if i < len(throttle_data) and throttle_data[i] == "Active": - active_throttle_reasons.append(f"{reason}: {self.explanations[reason]}") + active_throttle_reasons.append(f"{reason} ({self.explanations[reason]})") return nagiosplugin.Metric('throttle_reasons', '\n'.join(active_throttle_reasons) if active_throttle_reasons else 'None', context='throttle_reasons') except subprocess.CalledProcessError: @@ -253,10 +252,10 @@ def main(): help='warning threshold memory utilization in percentage') 
argp.add_argument('-mc', '--memory-critical', metavar='RANGE', default='0:99', help='critical threshold memory utilization in percentage') - argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80', - help='warning threshold fan speed in percentage') - argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95', - help='critical threshold fan speed in percentage') + # argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80', + # help='warning threshold fan speed in percentage') + # argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95', + # help='critical threshold fan speed in percentage') argp.add_argument('-pw', '--power-warning', metavar='RANGE', default='0:150', help='warning threshold power usage in watts') argp.add_argument('-pc', '--power-critical', metavar='RANGE', default='0:200', @@ -278,8 +277,8 @@ def main(): nagiosplugin.ScalarContext('total_memory'), GPUName(args.gpu_index), nagiosplugin.Context('gpu_name'), - GPUFanSpeed(args.gpu_index), - nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical), + # GPUFanSpeed(args.gpu_index), + # nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical), GPUPowerUsage(args.gpu_index), nagiosplugin.ScalarContext('power_usage', args.power_warning, args.power_critical), GPUPCIeLink(args.gpu_index),