#!/usr/bin/env python3
"""Nagios plugin that checks a single NVIDIA GPU through ``nvidia-smi``.

Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor

Not implemented:
ECC errors for various memory locations (ECCMemAggSgl, ECCL1AggSgl, ECCL2AggSgl, ECCRegAggSgl, ECCTexAggSgl)
Double bit ECC errors
Persistence mode
Inforom checksum validity
"""
import argparse
import subprocess

import nagiosplugin

# Vendor prefix removed from the reported GPU name to keep the summary short.
_VENDOR_PREFIX = 'NVIDIA '

# (result name, human-readable label, direction used in the problem message).
# The direction matches the default thresholds defined in main(): the scalar
# ranges alert when the value rises above the range, while the PCIe checks
# alert when the value is at or below the configured limit.
_PROBLEM_LABELS = (
    ('temperature', 'Temperature', 'high'),
    ('memory_util', 'Memory utilization', 'high'),
    ('fan_speed', 'Fan speed', 'high'),
    ('power_usage', 'Power usage', 'high'),
    ('pcie_link_gen', 'PCIe link generation', 'low'),
    ('pcie_link_width', 'PCIe link width', 'low'),
)


def _query_gpu(gpu_index, fields, output_format='csv,noheader,nounits'):
    """Run ``nvidia-smi --query-gpu`` for one GPU and return its stripped stdout.

    Args:
        gpu_index: value passed to ``--id`` to select the GPU.
        fields: comma-separated field list passed to ``--query-gpu``.
        output_format: value passed to ``--format``.

    Raises:
        nagiosplugin.CheckError: if nvidia-smi cannot be started or exits
            with a non-zero status.
    """
    command = [
        'nvidia-smi',
        f'--query-gpu={fields}',
        f'--format={output_format}',
        f'--id={gpu_index}',
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except (subprocess.CalledProcessError, OSError) as err:
        # OSError covers a missing or non-executable nvidia-smi binary.
        raise nagiosplugin.CheckError("Failed to execute nvidia-smi") from err
    return result.stdout.strip()


class BlankName(nagiosplugin.Resource):
    """Placeholder resource whose name is the empty string.

    It is registered first in main(); presumably this keeps a resource class
    name out of the plugin's status line -- confirm against nagiosplugin's
    check-naming rules.
    """

    @property
    def name(self):
        return ''

    def probe(self):
        """Return a single value-less metric for the ``blank_name`` context."""
        return nagiosplugin.Metric('blank_name', None, context='blank_name')


class GPUTemp(nagiosplugin.Resource):
    """GPU temperature, converted from Celsius to Fahrenheit."""

    def __init__(self, gpu_index):
        self.gpu_index = gpu_index

    def probe(self):
        """Return the ``temperature`` metric in degrees Fahrenheit.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi fails or its output is not
                a number.
        """
        raw = _query_gpu(self.gpu_index, 'temperature.gpu')
        try:
            temp_celsius = float(raw)
        except ValueError as err:
            raise nagiosplugin.CheckError("Failed to parse temperature") from err
        temp_fahrenheit = (temp_celsius * 9 / 5) + 32
        return nagiosplugin.Metric('temperature', temp_fahrenheit, uom='F', context='temperature')


class GPUMemoryUtil(nagiosplugin.Resource):
    """GPU memory usage as a percentage and as used/total amounts."""

    def __init__(self, gpu_index):
        self.gpu_index = gpu_index

    def probe(self):
        """Return the ``memory_util``, ``used_memory`` and ``total_memory`` metrics.

        The two values reported by nvidia-smi are divided by 1024 and emitted
        with the ``GB`` unit.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi fails, its output cannot be
                parsed, or the reported total memory is zero.
        """
        raw = _query_gpu(self.gpu_index, 'memory.used,memory.total')
        try:
            memory_info = raw.split(',')
            used_memory = int(memory_info[0])
            total_memory = int(memory_info[1])
            # A total of 0 would otherwise escape as an unhandled ZeroDivisionError.
            memory_util = int((used_memory / total_memory) * 100)
        except (ValueError, IndexError, ZeroDivisionError) as err:
            raise nagiosplugin.CheckError("Failed to parse memory utilization") from err
        used_memory_gb = round(used_memory / 1024, 1)
        total_memory_gb = round(total_memory / 1024, 1)
        return [
            nagiosplugin.Metric('memory_util', memory_util, uom='%', context='memory_util'),
            nagiosplugin.Metric('used_memory', used_memory_gb, uom='GB', context='used_memory'),
            nagiosplugin.Metric('total_memory', total_memory_gb, uom='GB', context='total_memory'),
        ]


class GPUName(nagiosplugin.Resource):
    """Product name of the GPU, without the leading vendor name."""

    def __init__(self, gpu_index):
        self.gpu_index = gpu_index

    def probe(self):
        """Return the ``gpu_name`` metric.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi fails.
        """
        gpu_name = _query_gpu(self.gpu_index, 'name')
        # str.strip('NVIDIA ') would remove any of those *characters* from both
        # ends (turning "NVIDIA A100" into "100"); drop the exact prefix only.
        if gpu_name.startswith(_VENDOR_PREFIX):
            gpu_name = gpu_name[len(_VENDOR_PREFIX):]
        return nagiosplugin.Metric('gpu_name', gpu_name, context='gpu_name')


class GPUFanSpeed(nagiosplugin.Resource):
    """GPU fan speed as a percentage."""

    def __init__(self, gpu_index):
        self.gpu_index = gpu_index

    def probe(self):
        """Return the ``fan_speed`` metric.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi fails or its output is not
                an integer.
        """
        raw = _query_gpu(self.gpu_index, 'fan.speed')
        try:
            fan_speed = int(raw)
        except ValueError as err:
            raise nagiosplugin.CheckError("Failed to parse fan speed") from err
        return nagiosplugin.Metric('fan_speed', fan_speed, uom='%', context='fan_speed')


class GPUPowerUsage(nagiosplugin.Resource):
    """GPU power draw in watts."""

    def __init__(self, gpu_index):
        self.gpu_index = gpu_index

    def probe(self):
        """Return the ``power_usage`` metric, rounded to one decimal place.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi fails or its output is not
                a number.
        """
        raw = _query_gpu(self.gpu_index, 'power.draw')
        try:
            power_usage = round(float(raw), 1)
        except ValueError as err:
            raise nagiosplugin.CheckError("Failed to parse power usage") from err
        return nagiosplugin.Metric('power_usage', power_usage, uom='W', context='power_usage')


class GPUPCIeLink(nagiosplugin.Resource):
    """Current PCIe link generation and link width of the GPU."""

    def __init__(self, gpu_index):
        self.gpu_index = gpu_index

    def probe(self):
        """Return the ``pcie_link_gen`` and ``pcie_link_width`` metrics.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi fails or its output cannot
                be parsed as two integers.
        """
        raw = _query_gpu(self.gpu_index, 'pcie.link.gen.current,pcie.link.width.current')
        try:
            pcie_info = raw.split(',')
            current_link_gen = int(pcie_info[0])
            current_link_width = int(pcie_info[1])
        except (ValueError, IndexError) as err:
            raise nagiosplugin.CheckError("Failed to parse PCIe link information") from err
        return [
            nagiosplugin.Metric('pcie_link_gen', current_link_gen, context='pcie_link_gen'),
            nagiosplugin.Metric('pcie_link_width', current_link_width, context='pcie_link_width'),
        ]


class GPUThrottleReasons(nagiosplugin.Resource):
    """Clock throttle reasons that nvidia-smi reports as ``Active``."""

    def __init__(self, gpu_index):
        self.gpu_index = gpu_index
        # Display name -> nvidia-smi query field. Insertion order defines the
        # column order of the query and therefore of the parsed output.
        self.throttle_reasons = {
            "Applications Clocks Setting": "clocks_throttle_reasons.applications_clocks_setting",
            "SW Power Cap": "clocks_throttle_reasons.sw_power_cap",
            "HW Slowdown": "clocks_throttle_reasons.hw_slowdown",
            "HW Thermal Slowdown": "clocks_throttle_reasons.hw_thermal_slowdown",
            "HW Power Brake Slowdown": "clocks_throttle_reasons.hw_power_brake_slowdown",
            "SW Thermal Slowdown": "clocks_throttle_reasons.sw_thermal_slowdown",
        }
        # Display name -> explanation appended to the problem message.
        self.explanations = {
            "Applications Clocks Setting": "GPU clocks are limited by the applications clocks setting",
            "SW Power Cap": "the SW Power Scaling algorithm is reducing the clocks because the GPU is consuming too much power",
            "HW Slowdown": "this can be caused by HW Thermal Slowdown (temperature being too high) or HW Power Brake Slowdown (power draw is too high)",
            "HW Thermal Slowdown": "the GPU temperature is too high",
            "HW Power Brake Slowdown": "the power draw is too high",
            "SW Thermal Slowdown": "the GPU temperature is higher than the maximum operating temperature",
        }

    def probe(self):
        """Return the ``throttle_reasons`` metric.

        The metric value is a newline-separated list of
        ``"<reason>: <explanation>"`` entries for every active reason, or the
        string ``'None'`` when no reason is active.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi fails or an active reason
                has no entry in ``self.explanations``.
        """
        query_fields = ','.join(self.throttle_reasons.values())
        raw = _query_gpu(self.gpu_index, query_fields, output_format='csv,noheader')
        # Tolerate any spacing around the separators.
        throttle_data = [state.strip() for state in raw.split(',')]
        try:
            # zip() stops at the shorter sequence, so missing columns are ignored.
            active_throttle_reasons = [
                f"{reason}: {self.explanations[reason]}"
                for reason, state in zip(self.throttle_reasons, throttle_data)
                if state == "Active"
            ]
        except KeyError as err:
            raise nagiosplugin.CheckError("Unknown throttle reason") from err
        value = '\n'.join(active_throttle_reasons) if active_throttle_reasons else 'None'
        return nagiosplugin.Metric('throttle_reasons', value, context='throttle_reasons')


class GPUSummary(nagiosplugin.Summary):
    """Build the one-line status text from the collected results."""

    def ok(self, results):
        """Return the full status line listing every measured value."""
        return "{}, Memory Utilization: {:.1f}% ({:.1f} GB / {:.1f} GB), Temperature: {:.1f}°F, Fan Speed: {:.1f}%, Power Usage: {:.1f} W, PCIe Link: Gen{} x{}".format(
            results['gpu_name'].metric.value,
            results['memory_util'].metric.value,
            results['used_memory'].metric.value,
            results['total_memory'].metric.value,
            results['temperature'].metric.value,
            results['fan_speed'].metric.value,
            results['power_usage'].metric.value,
            results['pcie_link_gen'].metric.value,
            results['pcie_link_width'].metric.value
        )

    def problem(self, results):
        """Return the detected problems followed by the regular status line."""
        problem_parts = []

        for name, label, direction in _PROBLEM_LABELS:
            state = results[name].state
            if state == nagiosplugin.state.Critical:
                problem_parts.append(f"{label} is critically {direction}")
            elif state == nagiosplugin.state.Warn:
                problem_parts.append(f"{label} is {direction}")

        throttling = results['throttle_reasons'].metric.value
        if throttling != 'None':
            problem_parts.append("Hardware throttling detected: " + throttling)

        return "{} -- {}".format(", ".join(problem_parts), self.ok(results))


@nagiosplugin.guarded
def main():
    """Parse the command line, assemble the check and run it."""
    argp = argparse.ArgumentParser(description="Check NVIDIA GPU temperature and memory utilization")
    argp.add_argument('-i', '--gpu-index', metavar='INDEX', default='0',
                      help='index of the GPU to check (default: 0)')
    argp.add_argument('-w', '--warning', metavar='RANGE', default='0:175',
                      help='warning threshold temperature in Fahrenheit')
    argp.add_argument('-c', '--critical', metavar='RANGE', default='0:194',
                      help='critical threshold temperature in Fahrenheit')
    argp.add_argument('-mw', '--memory-warning', metavar='RANGE', default='0:95',
                      help='warning threshold memory utilization in percentage')
    argp.add_argument('-mc', '--memory-critical', metavar='RANGE', default='0:99',
                      help='critical threshold memory utilization in percentage')
    argp.add_argument('-fw', '--fan-warning', metavar='RANGE', default='0:80',
                      help='warning threshold fan speed in percentage')
    argp.add_argument('-fc', '--fan-critical', metavar='RANGE', default='0:95',
                      help='critical threshold fan speed in percentage')
    argp.add_argument('-pw', '--power-warning', metavar='RANGE', default='0:150',
                      help='warning threshold power usage in watts')
    argp.add_argument('-pc', '--power-critical', metavar='RANGE', default='0:200',
                      help='critical threshold power usage in watts')
    argp.add_argument('-pgc', '--pcie-link-gen-critical', metavar='VALUE', default=1,
                      help='critical threshold PCIe link generation, must be 1 less than the lowest allowed')
    argp.add_argument('-pwc', '--pcie-link-width-critical', metavar='VALUE', default=15,
                      help='critical threshold PCIe link width, must be 1 less than the lowest allowed')
    args = argp.parse_args()

    check = nagiosplugin.Check(
        BlankName(),
        nagiosplugin.Context('blank_name'),
        GPUTemp(args.gpu_index),
        nagiosplugin.ScalarContext('temperature', args.warning, args.critical),
        GPUMemoryUtil(args.gpu_index),
        nagiosplugin.ScalarContext('memory_util', args.memory_warning, args.memory_critical),
        nagiosplugin.ScalarContext('used_memory'),
        nagiosplugin.ScalarContext('total_memory'),
        GPUName(args.gpu_index),
        nagiosplugin.Context('gpu_name'),
        GPUFanSpeed(args.gpu_index),
        nagiosplugin.ScalarContext('fan_speed', args.fan_warning, args.fan_critical),
        GPUPowerUsage(args.gpu_index),
        nagiosplugin.ScalarContext('power_usage', args.power_warning, args.power_critical),
        GPUPCIeLink(args.gpu_index),
        # The '@' prefix inverts the range: the check is critical when the
        # value lies inside 0..VALUE, i.e. at or below the given limit.
        nagiosplugin.ScalarContext('pcie_link_gen', '', f'@0:{args.pcie_link_gen_critical}'),
        nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{args.pcie_link_width_critical}'),
        GPUThrottleReasons(args.gpu_index),
        # NOTE(review): a plain Context appears never to raise the check state,
        # so throttling would only show up in the output when another metric is
        # already in a warning/critical state -- confirm whether that is intended.
        nagiosplugin.Context('throttle_reasons'),
        GPUSummary()
    )
    check.main()


if __name__ == '__main__':
    main()