2023-04-21 23:54:20 -06:00
#!/usr/bin/env python3
import argparse
import json
import os
import sys
import traceback
from pathlib import Path
import certifi
import numpy as np
import requests
2023-09-09 12:03:47 -06:00
from proxmoxer import ProxmoxAPI , ResourceException
2023-04-21 23:54:20 -06:00
import checker . nagios as nagios
from checker . markdown import list_to_markdown_table
from checker . units import filesize
def _verify_option(value):
    """Translate the --verify text into what is handed on as ``verify_ssl``.

    argparse only ever passes strings here, so the literal text ``false``
    would otherwise stay a truthy string and be treated as a CA bundle path.
    Returns ``False`` for "false"-like text, otherwise the value unchanged
    (a path to a CA bundle).
    """
    if value.strip().lower() in ('', 'false', '0', 'no'):
        return False
    return value


# Command-line interface. Parsing happens at import time and the result is
# kept in the module-level ``args`` that main() reads.
parser = argparse.ArgumentParser(description='Check the Proxmox API for network traffic for a host.')
parser.add_argument('--node', required=True, help='The name and address of Proxmox node in valid JSON in this format: ["bigserver", "192.168.1.222"]. This allows you to use datalists in Director.')
parser.add_argument('--user', required=True, help='The Proxmox user. Something like "monitoring@pve"')
parser.add_argument('--password', required=True, help='API password.')
parser.add_argument('--host', required=True, help='The ID of the host to check.')
parser.add_argument('--type', required=True, choices=['qemu', 'lxc'], help='Type of host. "qemu" or "lxc"')
parser.add_argument('--metrics', required=True, help='What stats to check. Can list multiple seperated by commas. For example, "netin,netout"')
parser.add_argument('--levels', required=True, help='Warning levels. In JSON format: {"netin": {"warn":50, "crit":100, "type": "filesize"}, "netout": {"warn":50, "crit":100, "type": "filesize"}}')
# type=int keeps a CLI-supplied value the same type as the integer default.
parser.add_argument('--timeframe', default=5, type=int, help='Timeframe to average the data to in minutes. Default: 5 minutes')
# The boolean default is not a string, so argparse does not run it through
# the type converter; only user-supplied text is translated.
parser.add_argument('--verify', default=False, type=_verify_option,
                    help="What to verify the SSL connection with. Can be a file path, or false to disable verification. If you're having issues with CA certs, try setting it to your system's CA bundle (/etc/ssl/certs/ca-certificates.crt). Default: false (verification disabled)")
parser.add_argument('--verify-force', action='store_true', help="Delete the certifi cert and replace it with whatever you specify in --verify")
parser.add_argument('--table', action='store_true', help='Print the results in a table.')
args = parser.parse_args()
# def where():
# return args.verify
def _exit_unknown(*parts):
    """Print *parts* as the plugin output and exit with the Nagios UNKNOWN code."""
    print(*parts)
    sys.exit(nagios.UNKNOWN)


def _point_ca_bundles_at(bundle):
    """Replace the CA files reported by certifi and requests with symlinks to *bundle*."""
    for locate in (certifi.where, requests.certs.where):
        target = locate()
        # lexists() also reports a dangling symlink (e.g. left behind by an
        # earlier run whose bundle has since moved); a plain exists() check
        # would skip the removal and os.symlink() would then fail.
        if os.path.lexists(target):
            os.remove(target)
        os.symlink(bundle, target)
        print(f'Pointed {target} to {bundle}')


def _load_levels(raw):
    """Parse the --levels JSON and turn float min/warn/crit thresholds into ints."""
    try:
        levels = json.loads(raw)
    except (TypeError, ValueError) as e:
        _exit_unknown('UNKNOWN: Failed to parse --levels JSON:', e)
    if not isinstance(levels, dict) or not all(isinstance(v, dict) for v in levels.values()):
        _exit_unknown('UNKNOWN: --levels must be a JSON object mapping each metric to an object')
    for thresholds in levels.values():
        # 'min' is optional; make the key always present so later code can test it for None.
        thresholds.setdefault('min', None)
        for key in ('min', 'warn', 'crit'):
            # Only touch keys that exist: a missing warn/crit must survive
            # until validation so the user gets the "missing key" message.
            if isinstance(thresholds.get(key), float):
                thresholds[key] = int(thresholds[key])
    return levels


def _requested_metrics(levels):
    """Return the names listed in --metrics after checking each has warn, crit and type."""
    names = [name.strip() for name in args.metrics.split(',') if name.strip()]
    if not names:
        _exit_unknown('UNKNOWN: no metrics given in --metrics')
    for name in names:
        if name not in levels:
            _exit_unknown(f'UNKNOWN: missing metric "{name}" in --levels')
        for key in ('warn', 'crit', 'type'):
            if key not in levels[name]:
                _exit_unknown(f'UNKNOWN: missing key "{key}" for metric "{name}" in --levels')
    return names


def _describe_permissions(prox):
    """Return what the API reports as the user's permissions, or the error text if that fails."""
    try:
        return prox('access/permissions').get()
    except Exception as e:
        return f'{e.__class__.__name__}: {e}'


def _samples_for(metric, api_data):
    """Collect the values recorded for *metric* in the RRD rows, rounding floats to 2 places."""
    samples = []
    for row in api_data:
        value = row.get(metric)
        if value is None:
            # Row has no usable reading for this metric; it cannot be averaged.
            continue
        if isinstance(value, float):
            value = np.round(value, 2)
        samples.append(value)
    return samples


def _grade(samples, thresholds, timeframe):
    """Average the recent samples of one metric and grade the result against its thresholds."""
    # The newest sample is always left out, exactly as the previous hard-coded
    # [-5:-1] window did (the reason is not recorded in this file -- TODO
    # confirm), so a timeframe of N averages the N-1 samples before it.
    # The original comment notes the rows are expected at 1-minute intervals.
    window = samples[-timeframe:-1]
    if not window:
        # Nothing to average. Carry a status too, so the --table output can
        # render this metric instead of failing on a missing key.
        return {'nan': True, 'value_str': 'NaN', 'status': nagios.UNKNOWN, 'status_str': '[UNKNOWN]'}

    avg = np.round(np.average(window), 2)
    result = {'nan': False, 'value': avg}
    if thresholds['type'] == 'filesize':
        # Display text comes from checker.units.filesize; the perfdata value
        # is the integer average with a "B" unit suffix.
        result['value_str'] = filesize(avg)
        result['value'] = f'{int(avg)}B'
    else:
        result['value_str'] = str(avg)

    if avg >= thresholds['crit']:
        result['status'], result['status_str'] = nagios.CRITICAL, '[CRITICAL]'
    elif avg >= thresholds['warn']:
        result['status'], result['status_str'] = nagios.WARNING, '[WARNING]'
    else:
        result['status'], result['status_str'] = nagios.OK, '[OK]'
    return result


def main():
    """Check averaged Proxmox guest metrics against thresholds and exit with a Nagios status.

    Reads the module-level ``args``. Steps:

    1. With --verify-force, repoint the certifi/requests CA files at --verify.
    2. Validate --timeframe, --levels, --metrics and --node before touching the network.
    3. Probe the node status endpoint, then fetch the guest's hourly rrddata.
    4. Average each requested metric over the recent window and grade it.
    5. Print the status line, optional table and perfdata, then ``sys.exit``
       with the worst status (UNKNOWN if any metric had no data).

    Every failure path prints an ``UNKNOWN: ...`` line and exits with
    ``nagios.UNKNOWN``; the function never returns normally.
    """
    if args.verify_force:
        if not args.verify:
            _exit_unknown('UNKNOWN: must supply --verify when using --verify-force')
        _point_ca_bundles_at(args.verify)

    # Converted here as well so this function does not depend on how the
    # argument parser typed the value.
    try:
        timeframe = int(args.timeframe)
    except (TypeError, ValueError):
        _exit_unknown(f'UNKNOWN: --timeframe must be a whole number of minutes, got "{args.timeframe}"')
    if timeframe < 2:
        # The newest sample is dropped, so anything below 2 leaves nothing to average.
        _exit_unknown('UNKNOWN: --timeframe must be at least 2')

    metrics_levels = _load_levels(args.levels)
    metric_names = _requested_metrics(metrics_levels)

    try:
        args.node = json.loads(args.node)
        pve_node = args.node[0]
        pve_node_address = args.node[1]
    except (TypeError, ValueError, LookupError) as e:
        _exit_unknown('UNKNOWN: Failed to parse --node JSON:', e)

    prox = ProxmoxAPI(pve_node_address, user=args.user, password=args.password, verify_ssl=args.verify)

    try:
        node_status = prox.nodes(pve_node).status.get()  # test connection and permissions
    except requests.exceptions.SSLError as e:
        print('UNKNOWN: SSL error', e)
        print('Using cert:', args.verify)
        print('certifi using cert:', certifi.where())
        print('requests using cert:', requests.certs.where())
        sys.exit(nagios.UNKNOWN)
    except ResourceException as e:
        print('UNKNOWN:', e)
        # Only asked for on this path; it is the one place the answer is shown.
        print(f'Proxmox reported "{args.user}" permissions as:', _describe_permissions(prox))
        sys.exit(nagios.UNKNOWN)
    except Exception as e:
        # Must stop here: continuing would only repeat the failure on the next request.
        _exit_unknown('UNKNOWN: failed to connect to Proxmox API:', e)
    if not node_status:
        _exit_unknown('UNKNOWN: PVE API returned no nodes.')

    try:
        api_data = prox(f'nodes/{pve_node}/{args.type}/{args.host}/rrddata?timeframe=hour').get()
    except Exception as e:
        _exit_unknown('UNKNOWN: Failed to fetch API data -', f'{e.__class__.__name__}: {e}')

    check_data = {
        name: _grade(_samples_for(name, api_data), metrics_levels[name], timeframe)
        for name in metric_names
    }

    # Any metric without data makes the whole check UNKNOWN; otherwise the
    # worst individual status wins.
    if any(entry['nan'] for entry in check_data.values()):
        exit_code = nagios.UNKNOWN
    else:
        exit_code = nagios.OK
        for entry in check_data.values():
            if exit_code < entry['status']:
                exit_code = entry['status']

    if exit_code == nagios.OK:
        prefix = 'OK: '
    elif exit_code == nagios.WARNING:
        prefix = 'WARNING: '
    elif exit_code == nagios.CRITICAL:
        prefix = 'CRITICAL: '
    else:
        prefix = 'UNKNOWN: '

    summary = ', '.join(f"{name} {entry['value_str']}" for name, entry in check_data.items())
    # Without --table the perfdata is appended to this same line, so no newline yet.
    print(f'{prefix}{summary}', end=('\n' if args.table else ''))

    perf_data = []
    for name, entry in check_data.items():
        if entry['nan']:
            continue
        thresholds = metrics_levels[name]
        minimum = '' if thresholds['min'] is None else thresholds['min']
        perf_data.append(f"'{name}'={entry['value']};{thresholds['warn']};{thresholds['crit']};{minimum};")
    perf_data_str = f'|{" ".join(perf_data)}' if perf_data else ''

    if args.table:
        output_table = [('Metric', 'Value', 'Status')]
        for name, entry in check_data.items():
            output_table.append((name, entry['value_str'], entry['status_str']))
        print(list_to_markdown_table(output_table, align='left', seperator='!', borders=False))

    print(perf_data_str)
    sys.exit(exit_code)
if __name__ == "__main__":
    # Last-resort boundary: any unexpected error must still produce a plugin
    # status line and the UNKNOWN exit code. sys.exit() inside main() raises
    # SystemExit, which is not an Exception subclass, so deliberate exits
    # pass straight through this handler.
    try:
        main()
    except Exception as e:
        # Include the exception class: for errors such as KeyError the message
        # alone is just the offending key and says nothing about what failed.
        print(f'UNKNOWN: exception "{e.__class__.__name__}: {e}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)