298 lines
14 KiB
Python
Executable File
298 lines
14 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import argparse
|
|
import nagiosplugin
|
|
import re
|
|
import subprocess
|
|
|
|
"""
|
|
Based on https://github.com/thomas-krenn/check_gpu_sensor_v1/blob/master/check_gpu_sensor
|
|
Not implemented:
|
|
ECC errors for various memory locations (ECCMemAggSgl, ECCL1AggSgl, ECCL2AggSgl, ECCRegAggSgl, ECCTexAggSgl)
|
|
Double bit ECC errors
|
|
Persistence mode
|
|
Inforom checksum validity
|
|
"""
|
|
|
|
|
|
class BlankName(nagiosplugin.Resource):
    """Placeholder resource that reports an empty name and a value-less metric.

    NOTE(review): presumably registered so that the check's status line is
    not prefixed with a resource name -- confirm against how nagiosplugin
    derives the check name.
    """

    @property
    def name(self):
        """Return the empty string as this resource's name."""
        return ''

    def probe(self):
        """Return a single ``blank_name`` metric whose value is ``None``."""
        placeholder = nagiosplugin.Metric('blank_name', None, context='blank_name')
        return placeholder
|
|
|
|
|
|
class GPUTemp(nagiosplugin.Resource):
    """Resource that reads one GPU's temperature through ``nvidia-smi``."""

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

    def probe(self):
        """Query ``temperature.gpu`` and return it as a ``temperature`` metric.

        Returns:
            A ``nagiosplugin.Metric`` named ``temperature`` with uom ``C``.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero
                status, or its output cannot be converted to a float.
        """
        command = [
            'nvidia-smi',
            '--query-gpu=temperature.gpu',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_index}',
        ]
        try:
            completed = subprocess.run(command, capture_output=True, text=True, check=True)
            reading = float(completed.stdout.strip())
            return nagiosplugin.Metric('temperature', reading, uom='C', context='temperature')
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
        except ValueError:
            raise nagiosplugin.CheckError("Failed to parse temperature")
|
|
|
|
|
|
class GPUMemoryUtil(nagiosplugin.Resource):
    """Resource that reports used/total GPU memory and their ratio."""

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

    def probe(self):
        """Query ``memory.used`` and ``memory.total`` and derive three metrics.

        Returns:
            A list of three ``nagiosplugin.Metric`` objects:
            ``memory_util`` (integer percentage, truncated), ``used_memory``
            and ``total_memory`` (raw values divided by 1024, rounded to one
            decimal, uom ``GB``).

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero
                status, its output cannot be parsed as two integers, or the
                reported total memory is not positive.
        """
        command = [
            'nvidia-smi',
            '--query-gpu=memory.used,memory.total',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_index}',
        ]
        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
            memory_info = result.stdout.strip().split(',')
            used_memory = int(memory_info[0])
            total_memory = int(memory_info[1])
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
        except (ValueError, IndexError):
            raise nagiosplugin.CheckError("Failed to parse memory utilization")

        # A zero total would otherwise surface as an unhandled
        # ZeroDivisionError instead of a proper check error.
        if total_memory <= 0:
            raise nagiosplugin.CheckError("nvidia-smi reported no total memory")

        memory_util = int((used_memory / total_memory) * 100)
        # NOTE(review): assumes nvidia-smi reports MiB when 'nounits' is set,
        # so dividing by 1024 yields GiB although the uom label is 'GB' -- confirm.
        used_memory_gb = round(used_memory / 1024, 1)
        total_memory_gb = round(total_memory / 1024, 1)
        return [
            nagiosplugin.Metric('memory_util', memory_util, uom='%', context='memory_util'),
            nagiosplugin.Metric('used_memory', used_memory_gb, uom='GB', context='used_memory'),
            nagiosplugin.Metric('total_memory', total_memory_gb, uom='GB', context='total_memory'),
        ]
|
|
|
|
|
|
class GPUName(nagiosplugin.Resource):
    """Resource that reports the GPU's product name without the vendor prefix."""

    # Vendor prefix removed from the product name before it is reported.
    _VENDOR_PREFIX = 'NVIDIA '

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

    def probe(self):
        """Query ``name`` and return it as a ``gpu_name`` metric.

        Returns:
            A ``nagiosplugin.Metric`` named ``gpu_name`` whose value is the
            name printed by nvidia-smi with a leading ``"NVIDIA "`` removed.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero status.
        """
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=name', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'],
                capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")

        gpu_name = result.stdout.strip()
        # str.strip('NVIDIA ') would remove any of the characters N, V, I, D,
        # A and space from BOTH ends (turning "NVIDIA A100" into "100"), so
        # drop the prefix explicitly instead.
        if gpu_name.startswith(self._VENDOR_PREFIX):
            gpu_name = gpu_name[len(self._VENDOR_PREFIX):]
        return nagiosplugin.Metric('gpu_name', gpu_name, context='gpu_name')
|
|
|
|
|
|
class GPUFanSpeed(nagiosplugin.Resource):
    """Resource that reads one GPU's fan speed through ``nvidia-smi``."""

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

    def probe(self):
        """Query ``fan.speed`` and return it as a ``fan_speed`` metric.

        Returns:
            A ``nagiosplugin.Metric`` named ``fan_speed`` with uom ``%``.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero
                status, or its output cannot be converted to an integer.
        """
        command = [
            'nvidia-smi',
            '--query-gpu=fan.speed',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_index}',
        ]
        try:
            completed = subprocess.run(command, capture_output=True, text=True, check=True)
            percent = int(completed.stdout.strip())
            return nagiosplugin.Metric('fan_speed', percent, uom='%', context='fan_speed')
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
        except ValueError:
            raise nagiosplugin.CheckError("Failed to parse fan speed")
|
|
|
|
|
|
class GPUPowerUsage(nagiosplugin.Resource):
    """Resource that reads one GPU's power draw through ``nvidia-smi``."""

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

    def probe(self):
        """Query ``power.draw`` and return it as a ``power_usage`` metric.

        Returns:
            A ``nagiosplugin.Metric`` named ``power_usage`` with uom ``W``;
            the value is rounded to one decimal place.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero
                status, or its output cannot be converted to a float.
        """
        command = [
            'nvidia-smi',
            '--query-gpu=power.draw',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_index}',
        ]
        try:
            completed = subprocess.run(command, capture_output=True, text=True, check=True)
            raw_draw = float(completed.stdout.strip())
            watts = round(raw_draw, 1)
            return nagiosplugin.Metric('power_usage', watts, uom='W', context='power_usage')
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
        except ValueError:
            raise nagiosplugin.CheckError("Failed to parse power usage")
|
|
|
|
|
|
class GPUPCIeLink(nagiosplugin.Resource):
    """Resource that reports the GPU's current PCIe link generation and width."""

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

    def probe(self):
        """Query the current PCIe link generation and width.

        Returns:
            A list of two ``nagiosplugin.Metric`` objects, ``pcie_link_gen``
            and ``pcie_link_width``, both integers.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero
                status, or its output does not contain two integers.
        """
        command = [
            'nvidia-smi',
            '--query-gpu=pcie.link.gen.current,pcie.link.width.current',
            '--format=csv,noheader,nounits',
            f'--id={self.gpu_index}',
        ]
        try:
            completed = subprocess.run(command, capture_output=True, text=True, check=True)
            fields = completed.stdout.strip().split(',')
            # First field is the generation, second the lane width.
            link_gen, link_width = int(fields[0]), int(fields[1])
            gen_metric = nagiosplugin.Metric('pcie_link_gen', link_gen, context='pcie_link_gen')
            width_metric = nagiosplugin.Metric('pcie_link_width', link_width, context='pcie_link_width')
            return [gen_metric, width_metric]
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
        except (ValueError, IndexError):
            raise nagiosplugin.CheckError("Failed to parse PCIe link information")
|
|
|
|
|
|
class GPUThrottleReasons(nagiosplugin.Resource):
    """Resource that lists the clock-throttle reasons currently active on a GPU."""

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

        # One row per throttle reason: (label, nvidia-smi query field, explanation).
        reason_table = (
            ("Applications Clocks Setting",
             "clocks_throttle_reasons.applications_clocks_setting",
             "GPU clocks are limited by the applications clocks setting"),
            ("SW Power Cap",
             "clocks_throttle_reasons.sw_power_cap",
             "the SW Power Scaling algorithm is reducing the clocks because the GPU is consuming too much power"),
            ("HW Slowdown",
             "clocks_throttle_reasons.hw_slowdown",
             "this can be caused by HW Thermal Slowdown (temperature being too high) or HW Power Brake Slowdown (power draw is too high)"),
            ("HW Thermal Slowdown",
             "clocks_throttle_reasons.hw_thermal_slowdown",
             "the GPU temperature is too high"),
            ("HW Power Brake Slowdown",
             "clocks_throttle_reasons.hw_power_brake_slowdown",
             "the power draw is too high"),
            ("SW Thermal Slowdown",
             "clocks_throttle_reasons.sw_thermal_slowdown",
             "the GPU temperature is higher than the maximum operating temperature"),
        )
        # label -> query field; insertion order fixes the column order of the query.
        self.throttle_reasons = {label: field for label, field, _ in reason_table}
        # label -> human-readable explanation appended to the report.
        self.explanations = {label: text for label, _, text in reason_table}

    def probe(self):
        """Query every throttle-reason field and report the ones marked ``Active``.

        Returns:
            A ``nagiosplugin.Metric`` named ``throttle_reasons`` whose value
            is the active reasons (each followed by its explanation in
            parentheses) joined by ``'; '``, or the string ``'None'`` when no
            reason is active.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero
                status, or a reason has no entry in ``self.explanations``.
        """
        try:
            columns = ','.join(self.throttle_reasons.values())
            command = ['nvidia-smi', f'--query-gpu={columns}', '--format=csv,noheader', f'--id={self.gpu_index}']
            completed = subprocess.run(command, capture_output=True, text=True, check=True)
            statuses = completed.stdout.strip().split(', ')
            # zip stops at the shorter sequence, so reasons without a
            # corresponding output column are simply skipped.
            active = [
                f"{reason} ({self.explanations[reason]})"
                for reason, status in zip(self.throttle_reasons, statuses)
                if status == "Active"
            ]
            report = '; '.join(active) or 'None'
            return nagiosplugin.Metric('throttle_reasons', report, context='throttle_reasons')
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")
        except KeyError:
            raise nagiosplugin.CheckError("Unknown throttle reason")
|
|
|
|
|
|
class GPUPowerState(nagiosplugin.Resource):
    """Resource that reports the GPU's performance state as an integer."""

    def __init__(self, gpu_index):
        # Value forwarded to nvidia-smi's --id option to select the GPU.
        self.gpu_index = gpu_index

    def probe(self):
        """Query ``pstate`` and return its numeric part as a ``power_state`` metric.

        Returns:
            A ``nagiosplugin.Metric`` named ``power_state`` holding the
            integer that follows the leading ``P`` in nvidia-smi's output.

        Raises:
            nagiosplugin.CheckError: if nvidia-smi exits with a non-zero
                status, or its output does not start with ``P<digits>``.
        """
        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=pstate', '--format=csv,noheader,nounits', f'--id={self.gpu_index}'],
                capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError:
            raise nagiosplugin.CheckError("Failed to execute nvidia-smi")

        power_state_str = result.stdout.strip()
        # Require at least one digit: the previous pattern P([0-9]*) matched a
        # bare "P" (int('') -> ValueError) and any non-matching output made
        # re.match return None (-> AttributeError); neither was handled.
        match = re.match(r'P([0-9]+)', power_state_str)
        if match is None:
            raise nagiosplugin.CheckError("Failed to parse power state")
        power_state_int = int(match.group(1))
        return nagiosplugin.Metric('power_state', power_state_int, context='power_state')
|
|
|
|
|
|
class GPUSummary(nagiosplugin.Summary):
    """Builds the status line for the GPU check from the collected results."""

    # Metrics that ok() and problem() index by name.
    _REQUIRED_METRICS = (
        'gpu_name', 'memory_util', 'used_memory', 'total_memory',
        'power_state', 'temperature', 'power_usage',
        'pcie_link_gen', 'pcie_link_width', 'throttle_reasons',
    )

    # (metric name, text when Critical, text when Warn), in report order.
    _THRESHOLD_MESSAGES = (
        ('temperature', "Temperature is critically high", "Temperature is high"),
        ('memory_util', "Memory utilization is critically high", "Memory utilization is high"),
        # Fan speed reporting is disabled together with the fan resource:
        # ('fan_speed', "Fan speed is critically low", "Fan speed is low"),
        ('power_usage', "Power usage is critically high", "Power usage is high"),
        ('pcie_link_gen', "PCIe link generation is critically low", "PCIe link generation is low"),
        ('pcie_link_width', "PCIe link width is critically low", "PCIe link width is low"),
    )

    def ok(self, results):
        """Return the one-line overview of all GPU readings.

        Args:
            results: the check's results, indexable by metric name.

        Returns:
            A string with name, memory utilization, power state,
            temperature, power usage and PCIe link of the GPU.
        """
        return "{}, Memory Utilization: {}% ({:.1f} GB / {:.1f} GB), Power State: P{}, Temperature: {}°C, Power Usage: {} W, PCIe Link: Gen{} x{}".format(
            results['gpu_name'].metric.value,
            int(results['memory_util'].metric.value),
            results['used_memory'].metric.value,
            results['total_memory'].metric.value,
            results['power_state'].metric.value,
            int(results['temperature'].metric.value),
            # results['fan_speed'].metric.value,  # Fan Speed: {:.1f}%
            int(results['power_usage'].metric.value),
            results['pcie_link_gen'].metric.value,
            results['pcie_link_width'].metric.value,
        )

    def problem(self, results):
        """Return the status line used when the check is not OK.

        Args:
            results: the check's results, indexable by metric name.

        Returns:
            The detected problems joined by ``", "``, followed by ``" -- "``
            and the regular overview from ``ok()``. When a metric is missing
            or no specific problem text applies, the base class summary is
            returned instead.
        """
        # A probe that raised CheckError leaves no result under its metric
        # name; indexing it would raise KeyError and hide the real error, so
        # defer to the default summary in that case.
        if any(name not in results for name in self._REQUIRED_METRICS):
            return super().problem(results)

        problem_parts = []
        for name, critical_text, warn_text in self._THRESHOLD_MESSAGES:
            state = results[name].state
            if state == nagiosplugin.state.Critical:
                problem_parts.append(critical_text)
            elif state == nagiosplugin.state.Warn:
                problem_parts.append(warn_text)

        # The throttle metric's value is the literal string 'None' when no
        # throttle reason is active.
        throttle_value = results['throttle_reasons'].metric.value
        if throttle_value != 'None':
            problem_parts.append("Hardware throttling detected: " + throttle_value)

        if not problem_parts:
            # Avoid emitting a status line that starts with a bare " -- ".
            return super().problem(results)

        return "{} -- {}".format(", ".join(problem_parts), self.ok(results))
|
|
|
|
|
|
@nagiosplugin.guarded
def main():
    """Parse the command line, assemble the GPU check and run it.

    Each resource is followed by the context(s) that evaluate its metrics;
    the threshold ranges come from the command-line options.
    """
    parser = argparse.ArgumentParser(description="Check NVIDIA GPU temperature and memory utilization")
    parser.add_argument('-i', '--gpu-index', metavar='INDEX', default='0',
                        help='index of the GPU to check (default: 0)')

    # (short flag, long flag, metavar, default, help) for every threshold option.
    threshold_options = (
        ('-w', '--warning', 'RANGE', '0:80',
         'warning threshold temperature in Celsius'),
        ('-c', '--critical', 'RANGE', '0:90',
         'critical threshold temperature in Celsius'),
        ('-mw', '--memory-warning', 'RANGE', '0:95',
         'warning threshold memory utilization in percentage'),
        ('-mc', '--memory-critical', 'RANGE', '0:99',
         'critical threshold memory utilization in percentage'),
        # Fan thresholds are disabled together with the fan resource:
        # ('-fw', '--fan-warning', 'RANGE', '0:80',
        #  'warning threshold fan speed in percentage'),
        # ('-fc', '--fan-critical', 'RANGE', '0:95',
        #  'critical threshold fan speed in percentage'),
        ('-pw', '--power-warning', 'RANGE', '0:150',
         'warning threshold power usage in watts'),
        ('-pc', '--power-critical', 'RANGE', '0:200',
         'critical threshold power usage in watts'),
        ('-pgc', '--pcie-link-gen-critical', 'VALUE', 1,
         'critical threshold PCIe link generation, must be 1 less than the lowest allowed'),
        ('-pwc', '--pcie-link-width-critical', 'VALUE', 15,
         'critical threshold PCIe link width, must be 1 less than the lowest allowed'),
    )
    for short_flag, long_flag, placeholder, fallback, description in threshold_options:
        parser.add_argument(short_flag, long_flag, metavar=placeholder, default=fallback,
                            help=description)

    options = parser.parse_args()
    gpu = options.gpu_index

    components = [
        BlankName(),
        nagiosplugin.Context('blank_name'),
        GPUTemp(gpu),
        nagiosplugin.ScalarContext('temperature', options.warning, options.critical),
        GPUMemoryUtil(gpu),
        nagiosplugin.ScalarContext('memory_util', options.memory_warning, options.memory_critical),
        nagiosplugin.ScalarContext('used_memory'),
        nagiosplugin.ScalarContext('total_memory'),
        GPUName(gpu),
        nagiosplugin.Context('gpu_name'),
        # GPUFanSpeed(gpu),
        # nagiosplugin.ScalarContext('fan_speed', options.fan_warning, options.fan_critical),
        GPUPowerUsage(gpu),
        nagiosplugin.ScalarContext('power_usage', options.power_warning, options.power_critical),
        GPUPCIeLink(gpu),
        # The '@' prefix inverts the range: values inside 0..threshold are critical.
        nagiosplugin.ScalarContext('pcie_link_gen', '', f'@0:{options.pcie_link_gen_critical}'),
        nagiosplugin.ScalarContext('pcie_link_width', '', f'@0:{options.pcie_link_width_critical}'),
        GPUThrottleReasons(gpu),
        nagiosplugin.Context('throttle_reasons'),
        GPUPowerState(gpu),
        nagiosplugin.ScalarContext('power_state'),
        GPUSummary(),
    ]
    nagiosplugin.Check(*components).main()
|
|
|
|
|
|
# Run the check only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|