updates

parent 23fb350116
commit d1665ef9d1

@@ -0,0 +1,240 @@
#!/bin/bash
|
||||
# startup checks
|
||||
|
||||
if [ -z "$BASH" ]; then
|
||||
echo "Please use BASH."
|
||||
exit 3
|
||||
fi
|
||||
if [ ! -e "/usr/bin/which" ]; then
|
||||
echo "/usr/bin/which is missing."
|
||||
exit 3
|
||||
fi
|
||||
curl=$(which curl)
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Please install curl."
|
||||
exit 3
|
||||
fi
|
||||
|
||||
|
||||
# Default Values
|
||||
proxy=""
|
||||
method="GET"
|
||||
body=""
|
||||
contains=""
|
||||
lacks=""
|
||||
insecure=0
|
||||
debug=0
|
||||
warning=700
|
||||
encodeurl=0
|
||||
critical=2000
|
||||
url=""
|
||||
follow=0
|
||||
header=""
|
||||
name="default"
|
||||
cookies=0
|
||||
|
||||
# Usage Info
|
||||
usage() {
|
||||
echo '''Usage: check_curl [OPTIONS]
|
||||
[OPTIONS]:
|
||||
-U URL Target URL
|
||||
-M METHOD HTTP Method (default: GET)
|
||||
-N NAME Display Name of scanned object (default: default)
|
||||
-B BODY Request Body to be sent (default: not sent)
|
||||
-E ENCODEURL Send body defined with url encoding (curl --data-urlencode) (default: off)
|
||||
-I INSECURE Sets the curl flag --insecure
|
||||
-C CONTAINS If not contained in response body, CRITICAL will be returned
|
||||
-L LACKS If contained in response body, CRITICAL will be returned (-C has priority when both are set)
|
||||
-w WARNING Warning threshold in milliseconds (default: 700)
|
||||
-c CRITICAL Critical threshold in milliseconds (default: 2000)
|
||||
-H HEADER Send Header (e.g. "AUTHORIZATION: Bearer 8*.UdUYwrl!nK")
|
||||
-F FOLLOW Follow redirects (default: OFF)
|
||||
-D DEBUG Only prints the curl command (default: OFF)
|
||||
-P PROXY Set Proxy Address (default: No Proxy)
|
||||
-K COOKIES Enables cookie handling in a temporary cookie jar'''
|
||||
}
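
# Example invocation (hypothetical URL and thresholds): require the word "OK"
# in the response body, follow redirects and alert when the request is slow:
#   ./check_curl -U "https://example.com/health" -C "OK" -F -w 500 -c 1500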
|
||||
|
||||
|
||||
# Check which threshold was reached
|
||||
checkTime() {
|
||||
if [ $1 -gt $critical ]; then
|
||||
echo -n "CRITICAL: Slow "
|
||||
elif [ $1 -gt $warning ]; then
|
||||
echo -n "WARNING: Slow "
|
||||
else
|
||||
echo -n "OK"
|
||||
fi
|
||||
}
|
||||
|
||||
# Map the elapsed time to a Nagios return code (0=OK, 1=WARNING, 2=CRITICAL)
|
||||
getStatus() {
|
||||
if [ $1 -gt $critical ]; then
|
||||
return 2
|
||||
elif [ $1 -gt $warning ]; then
|
||||
return 1
|
||||
else
|
||||
return 0
|
||||
fi
|
||||
}
|
||||
|
||||
#main
|
||||
#get options
|
||||
while getopts "P:M:B:C:w:c:U:H:IFN:O:EL:D:K" opt; do
|
||||
case $opt in
|
||||
K)
|
||||
cookies=1
|
||||
;;
|
||||
P)
|
||||
proxy=$OPTARG
|
||||
;;
|
||||
M)
|
||||
method=$OPTARG
|
||||
;;
|
||||
B)
|
||||
body=$OPTARG
|
||||
;;
|
||||
C)
|
||||
contains=$OPTARG
|
||||
;;
|
||||
w)
|
||||
warning=$OPTARG
|
||||
;;
|
||||
c)
|
||||
critical=$OPTARG
|
||||
;;
|
||||
U)
|
||||
url=$OPTARG
|
||||
;;
|
||||
L)
|
||||
lacks=$OPTARG
|
||||
;;
|
||||
I)
|
||||
insecure=1
|
||||
;;
|
||||
N)
|
||||
name=$( echo "$OPTARG" | sed -e 's/[^A-Za-z0-9._-]/_/g' )
|
||||
;;
|
||||
E)
|
||||
encodeurl=1
|
||||
;;
|
||||
H)
|
||||
header=$OPTARG
|
||||
;;
|
||||
F)
|
||||
follow=1
|
||||
;;
|
||||
D)
|
||||
debug=1
|
||||
;;
|
||||
*)
|
||||
usage
|
||||
exit 3
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
#hostname is required
|
||||
if [ -z "$url" ] || [ $# -eq 0 ]; then
|
||||
echo "Error: URL is required"
|
||||
usage
|
||||
exit 3
|
||||
fi
|
||||
|
||||
proxyarg=""
|
||||
if [ ! -z "$proxy" ] ; then
|
||||
proxyarg=" -x "$proxy" "
|
||||
fi
|
||||
headerarg=""
|
||||
if [ ! -z "$header" ] ; then
|
||||
headerarg=' -H "'$header'" '
|
||||
fi
|
||||
followarg=""
|
||||
if [ $follow -eq 1 ] ; then
|
||||
followarg=" -L "
|
||||
fi
|
||||
insecurearg=""
|
||||
if [ $insecure -eq 1 ] ; then
|
||||
insecurearg=" --insecure "
|
||||
fi
|
||||
cookiesarg=""
|
||||
if [ $cookies -eq 1 ] ; then
|
||||
COOKIE_JAR_TEMP_PATH=$(mktemp /tmp/check_curl_cookiejar.XXXXXX)
|
||||
cookiesarg=" -c ${COOKIE_JAR_TEMP_PATH} -b ${COOKIE_JAR_TEMP_PATH}"
|
||||
fi
|
||||
bodyarg=""
|
||||
if [ ! -z "$body" ]; then
|
||||
body=$(echo "$body" | sed "s/\"/\\\\\"/g")
|
||||
bodyarg=" --data \""$body"\""
|
||||
if [ $encodeurl -eq 1 ]; then
|
||||
bodyarg=" --data-urlencode \""$body"\""
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $debug -eq 1 ]; then
|
||||
echo $curl --no-keepalive -s $insecurearg $proxyarg $followarg $bodyarg $headerarg -X $method $cookiesarg "$url"
|
||||
exit 0
|
||||
else
|
||||
start=$(echo $(($(date +%s%N)/1000000)))
|
||||
body=$(eval $curl --no-keepalive -s $insecurearg $proxyarg $followarg $bodyarg $headerarg -X $method $cookiesarg "$url")
|
||||
status=$?
|
||||
fi
|
||||
|
||||
if [ $cookies -eq 1 ] ; then
|
||||
rm -f ${COOKIE_JAR_TEMP_PATH}
|
||||
fi
|
||||
|
||||
end=$(echo $(($(date +%s%N)/1000000)))
|
||||
#decide output by return code
|
||||
if [ $status -eq 0 ] ; then
|
||||
if [ -n "$contains" ]; then
|
||||
if [[ ! $body =~ $contains ]]; then
|
||||
echo "CRITICAL: body does not contain '${contains}'. Body: '$(echo $body | sed 's/\(.\{50\}\).*/\1.../')' |time=$((end - start))ms;${warning};${critical};0;"$critical"ms"
|
||||
exit 2
|
||||
fi
|
||||
fi
|
||||
if [ -n "$lacks" ]; then
|
||||
if [[ $body == *$lacks* ]]; then
|
||||
echo "CRITICAL: body contains '${lacks}'|time=$((end - start))ms;${warning};${critical};0;"$critical"ms"
|
||||
exit 2
|
||||
fi
|
||||
fi
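# Build the Nagios/Icinga plugin output: "STATE text - url|label=value;warn;crit;min;max".
# The elapsed time in milliseconds is reported as perfdata against the warning
# and critical thresholds.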
|
||||
echo "$(checkTime $((end - start))) $((end - start))ms - ${url}|time=$((end - start))ms;${warning};${critical};0;"$critical"ms"
|
||||
getStatus $((end - start))
|
||||
exit $?
|
||||
else
|
||||
case $status in
|
||||
1)
|
||||
echo "CRITICAL: Unsupported protocol"
|
||||
;;
|
||||
3)
|
||||
echo "CRITICAL: Malformed URL"
|
||||
;;
|
||||
5)
|
||||
echo "CRITICAL: Couldn't resolve proxy $proxy"
|
||||
;;
|
||||
6)
|
||||
echo "CRITICAL: Couldn't resolve host"
|
||||
;;
|
||||
7)
|
||||
echo "CRITICAL: Couldn't connect to proxy $proxy"
|
||||
;;
|
||||
22)
|
||||
echo "CRITICAL: Server returned http code >= 400"
|
||||
;;
|
||||
52)
|
||||
echo "CRITICAL: Server returned empty response (52)"
|
||||
;;
|
||||
56)
|
||||
echo "CRITICAL: Failure recieving network data (56)"
|
||||
;;
|
||||
60)
|
||||
echo "CRITICAL: SSL/TLS connection problem (60)"
|
||||
;;
|
||||
*)
|
||||
echo "UNKNOWN: $status - ${url}"
|
||||
exit 3
|
||||
;;
|
||||
esac
|
||||
exit 2
|
||||
fi
|
||||
|
|
@@ -244,7 +244,7 @@ async def main() -> None:
|
|||
|
||||
for x in prints:
|
||||
print(f'\n{x}', end=' ')
|
||||
print(f"|'{bot1_hs_domain}_outbound'={bot1_output_msg}s;;; '{bot1_hs_domain}_inbound'={bot2_output_msg}s;;;")
|
||||
print(f"|'{bot1_hs_domain}_outbound'={bot1_output_msg}s;;; '{bot1_hs_domain}_inbound'={bot1_output_msg}s;;;")
|
||||
|
||||
sys.exit(nagios_output)
|
||||
|
||||
|
|
|
@@ -179,7 +179,7 @@ async def main() -> None:
|
|||
exit_code = nagios.CRITICAL
|
||||
prints.append(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}")
|
||||
else:
|
||||
prints.append(f'OK: is not redirected.')
|
||||
prints.append(f'OK: was not redirected.')
|
||||
|
||||
if args.required_headers:
|
||||
# Icinga may pass the values as one string
|
||||
|
@@ -192,11 +192,11 @@ async def main() -> None:
|
|||
if code > exit_code:
|
||||
exit_code = code
|
||||
|
||||
results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')]
|
||||
for header_chk, code in results:
|
||||
prints.append(header_chk)
|
||||
if code > exit_code:
|
||||
exit_code = code
|
||||
# results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')]
|
||||
# for header_chk, code in results:
|
||||
# prints.append(header_chk)
|
||||
# if code > exit_code:
|
||||
# exit_code = code
|
||||
|
||||
clean_msg = await cleanup(client, test_image_path, image_event_id=image_event_id)
|
||||
|
||||
|
|
|
@@ -54,9 +54,11 @@ def main():
|
|||
m = re.match(r'<span class="tooltip">\s*Send: (.*?)\s*<br\/>\s*Receive: (.*?)\s*<\/span>', str(item))
|
||||
if m:
|
||||
domain = item.parent.parent.find('span', {'class': 'domain'}).text
|
||||
s = ms_to_s(m.group(1))
|
||||
r = ms_to_s(m.group(2))
|
||||
data[domain] = {
|
||||
'send': ms_to_s(m.group(1)),
|
||||
'receive': ms_to_s(m.group(2)),
|
||||
'send': (s if s else -1),
|
||||
'receive': (r if r else -1),
|
||||
}
|
||||
exit_code = nagios.OK
|
||||
info_str = []
|
||||
|
|
|
@@ -0,0 +1,113 @@
#!/usr/bin/env python3
# check_nginx is a Nagios plugin to monitor nginx status
# The version is 1.0.2
# fixed by Nikolay Kandalintsev (twitter: @nicloay)
# Based on yangzi2008@126.com from http://www.nginxs.com
# which is available here http://exchange.nagios.org/directory/Plugins/Web-Servers/nginx/check_nginx/details

import getopt
import sys
import traceback

import urllib.request
from urllib.request import urlopen


def usage():
    print("""check_nginx is a Nagios plugin to monitor nginx status
Usage:

check_nginx [-h|--help][-U|--url][-P|--path][-u|--user][-p|--passwd][-w|--warning][-c|--critical]

Options:
--help|-h)
print check_nginx help.
--url|-U)
Sets nginx status url.
--path|-P)
Sets nginx status url path. Default is: off
--user|-u)
Sets nginx status BasicAuth user. Default is: off
--passwd|-p)
Sets nginx status BasicAuth passwd. Default is: off
--warning|-w)
Sets a warning level for nginx Active connections. Default is: off
--critical|-c)
Sets a critical level for nginx Active connections. Default is: off
Example:
The url is www.nginxs.com/status
./check_nginx -U www.nginxs.com -P /status -u eric -p nginx -w 1000 -c 2000
if you don't have a password:
./check_nginx -U www.nginxs.com -P /status -w 1000 -c 2000
if you don't have a path or password:
./check_nginx -U www.nginxs.com -w 1000 -c 2000""")

    sys.exit(3)


try:
    options, args = getopt.getopt(sys.argv[1:], "hU:P:u:p:w:c:", ["help", "url=", "path=", "user=", "passwd=", "warning=", "critical="])

except getopt.GetoptError:
    usage()
    sys.exit(3)

for name, value in options:
    if name in ("-h", "--help"):
        usage()
    if name in ("-U", "--url"):
        url = "http://" + value
    if name in ("-P", "--path"):
        path = value
    if name in ("-u", "--user"):
        user = value
    if name in ("-p", "--passwd"):
        passwd = value
    if name in ("-w", "--warning"):
        warning = value
    if name in ("-c", "--critical"):
        critical = value
try:
    if 'path' in dir():
        req = urllib.request.Request(url + path)
    else:
        req = urllib.request.Request(url)
    if 'user' in dir() and 'passwd' in dir():
        passman = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, url + path if 'path' in dir() else url, user, passwd)
        authhandler = urllib.request.HTTPBasicAuthHandler(passman)
        opener = urllib.request.build_opener(authhandler)
        urllib.request.install_opener(opener)
    response = urlopen(req)
    # The nginx stub_status page has the form (values are examples):
    #   Active connections: 291
    #   server accepts handled requests
    #    16630948 16630948 31070465
    #   Reading: 6 Writing: 179 Waiting: 106
    the_page = response.readline().decode()
    conn = the_page.split()
    ActiveConn = conn[2]
    the_page1 = response.readline()
    the_page2 = response.readline()
    the_page3 = response.readline().decode()
    response.close()
    b = the_page3.split()
    reading = b[1]
    writing = b[3]
    waiting = b[5]
    output = 'ActiveConn:%s,reading:%s,writing:%s,waiting:%s' % (ActiveConn, reading, writing, waiting)
    perfdata = 'ActiveConn=%s;reading=%s;writing=%s;waiting=%s' % (ActiveConn, reading, writing, waiting)

except Exception:
    print("NGINX STATUS unknown: Error while getting Connection")
    print(traceback.format_exc())
    sys.exit(3)
if 'warning' in dir() and 'critical' in dir():
    if int(ActiveConn) >= int(critical):
        print('CRITICAL - %s|%s' % (output, perfdata))
        sys.exit(2)
    elif int(ActiveConn) >= int(warning):
        print('WARNING - %s|%s' % (output, perfdata))
        sys.exit(1)
    else:
        print('OK - %s|%s' % (output, perfdata))
        sys.exit(0)
else:
    print('OK - %s|%s' % (output, perfdata))
    sys.exit(0)
@@ -0,0 +1,10 @@
FROM python:3

ADD check_pve.py /
ADD requirements.txt /
RUN apt-get update
RUN apt-get install -y python3 python3-requests python3-packaging
RUN pip3 install -r requirements.txt


CMD ["tail", "-f", "/dev/null"]
@@ -0,0 +1,339 @@
|
|||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 2, June 1991
|
||||
|
||||
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
License is intended to guarantee your freedom to share and change free
|
||||
software--to make sure the software is free for all its users. This
|
||||
General Public License applies to most of the Free Software
|
||||
Foundation's software and to any other program whose authors commit to
|
||||
using it. (Some other Free Software Foundation software is covered by
|
||||
the GNU Lesser General Public License instead.) You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
this service if you wish), that you receive source code or can get it
|
||||
if you want it, that you can change the software or use pieces of it
|
||||
in new free programs; and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
anyone to deny you these rights or to ask you to surrender the rights.
|
||||
These restrictions translate to certain responsibilities for you if you
|
||||
distribute copies of the software, or if you modify it.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must give the recipients all the rights that
|
||||
you have. You must make sure that they, too, receive or can get the
|
||||
source code. And you must show them these terms so they know their
|
||||
rights.
|
||||
|
||||
We protect your rights with two steps: (1) copyright the software, and
|
||||
(2) offer you this license which gives you legal permission to copy,
|
||||
distribute and/or modify the software.
|
||||
|
||||
Also, for each author's protection and ours, we want to make certain
|
||||
that everyone understands that there is no warranty for this free
|
||||
software. If the software is modified by someone else and passed on, we
|
||||
want its recipients to know that what they have is not the original, so
|
||||
that any problems introduced by others will not reflect on the original
|
||||
authors' reputations.
|
||||
|
||||
Finally, any free program is threatened constantly by software
|
||||
patents. We wish to avoid the danger that redistributors of a free
|
||||
program will individually obtain patent licenses, in effect making the
|
||||
program proprietary. To prevent this, we have made it clear that any
|
||||
patent must be licensed for everyone's free use or not licensed at all.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
GNU GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License applies to any program or other work which contains
|
||||
a notice placed by the copyright holder saying it may be distributed
|
||||
under the terms of this General Public License. The "Program", below,
|
||||
refers to any such program or work, and a "work based on the Program"
|
||||
means either the Program or any derivative work under copyright law:
|
||||
that is to say, a work containing the Program or a portion of it,
|
||||
either verbatim or with modifications and/or translated into another
|
||||
language. (Hereinafter, translation is included without limitation in
|
||||
the term "modification".) Each licensee is addressed as "you".
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running the Program is not restricted, and the output from the Program
|
||||
is covered only if its contents constitute a work based on the
|
||||
Program (independent of having been made by running the Program).
|
||||
Whether that is true depends on what the Program does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Program's
|
||||
source code as you receive it, in any medium, provided that you
|
||||
conspicuously and appropriately publish on each copy an appropriate
|
||||
copyright notice and disclaimer of warranty; keep intact all the
|
||||
notices that refer to this License and to the absence of any warranty;
|
||||
and give any other recipients of the Program a copy of this License
|
||||
along with the Program.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy, and
|
||||
you may at your option offer warranty protection in exchange for a fee.
|
||||
|
||||
2. You may modify your copy or copies of the Program or any portion
|
||||
of it, thus forming a work based on the Program, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) You must cause the modified files to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
b) You must cause any work that you distribute or publish, that in
|
||||
whole or in part contains or is derived from the Program or any
|
||||
part thereof, to be licensed as a whole at no charge to all third
|
||||
parties under the terms of this License.
|
||||
|
||||
c) If the modified program normally reads commands interactively
|
||||
when run, you must cause it, when started running for such
|
||||
interactive use in the most ordinary way, to print or display an
|
||||
announcement including an appropriate copyright notice and a
|
||||
notice that there is no warranty (or else, saying that you provide
|
||||
a warranty) and that users may redistribute the program under
|
||||
these conditions, and telling the user how to view a copy of this
|
||||
License. (Exception: if the Program itself is interactive but
|
||||
does not normally print such an announcement, your work based on
|
||||
the Program is not required to print an announcement.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Program,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Program, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Program.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Program
|
||||
with the Program (or with a work based on the Program) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may copy and distribute the Program (or a work based on it,
|
||||
under Section 2) in object code or executable form under the terms of
|
||||
Sections 1 and 2 above provided that you also do one of the following:
|
||||
|
||||
a) Accompany it with the complete corresponding machine-readable
|
||||
source code, which must be distributed under the terms of Sections
|
||||
1 and 2 above on a medium customarily used for software interchange; or,
|
||||
|
||||
b) Accompany it with a written offer, valid for at least three
|
||||
years, to give any third party, for a charge no more than your
|
||||
cost of physically performing source distribution, a complete
|
||||
machine-readable copy of the corresponding source code, to be
|
||||
distributed under the terms of Sections 1 and 2 above on a medium
|
||||
customarily used for software interchange; or,
|
||||
|
||||
c) Accompany it with the information you received as to the offer
|
||||
to distribute corresponding source code. (This alternative is
|
||||
allowed only for noncommercial distribution and only if you
|
||||
received the program in object code or executable form with such
|
||||
an offer, in accord with Subsection b above.)
|
||||
|
||||
The source code for a work means the preferred form of the work for
|
||||
making modifications to it. For an executable work, complete source
|
||||
code means all the source code for all modules it contains, plus any
|
||||
associated interface definition files, plus the scripts used to
|
||||
control compilation and installation of the executable. However, as a
|
||||
special exception, the source code distributed need not include
|
||||
anything that is normally distributed (in either source or binary
|
||||
form) with the major components (compiler, kernel, and so on) of the
|
||||
operating system on which the executable runs, unless that component
|
||||
itself accompanies the executable.
|
||||
|
||||
If distribution of executable or object code is made by offering
|
||||
access to copy from a designated place, then offering equivalent
|
||||
access to copy the source code from the same place counts as
|
||||
distribution of the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
4. You may not copy, modify, sublicense, or distribute the Program
|
||||
except as expressly provided under this License. Any attempt
|
||||
otherwise to copy, modify, sublicense or distribute the Program is
|
||||
void, and will automatically terminate your rights under this License.
|
||||
However, parties who have received copies, or rights, from you under
|
||||
this License will not have their licenses terminated so long as such
|
||||
parties remain in full compliance.
|
||||
|
||||
5. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Program or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Program (or any work based on the
|
||||
Program), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Program or works based on it.
|
||||
|
||||
6. Each time you redistribute the Program (or any work based on the
|
||||
Program), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute or modify the Program subject to
|
||||
these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties to
|
||||
this License.
|
||||
|
||||
7. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Program at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Program by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Program.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under
|
||||
any particular circumstance, the balance of the section is intended to
|
||||
apply and the section as a whole is intended to apply in other
|
||||
circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system, which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
8. If the distribution and/or use of the Program is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Program under this License
|
||||
may add an explicit geographical distribution limitation excluding
|
||||
those countries, so that distribution is permitted only in or among
|
||||
countries not thus excluded. In such case, this License incorporates
|
||||
the limitation as if written in the body of this License.
|
||||
|
||||
9. The Free Software Foundation may publish revised and/or new versions
|
||||
of the General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Program
|
||||
specifies a version number of this License which applies to it and "any
|
||||
later version", you have the option of following the terms and conditions
|
||||
either of that version or of any later version published by the Free
|
||||
Software Foundation. If the Program does not specify a version number of
|
||||
this License, you may choose any version ever published by the Free Software
|
||||
Foundation.
|
||||
|
||||
10. If you wish to incorporate parts of the Program into other free
|
||||
programs whose distribution conditions are different, write to the author
|
||||
to ask for permission. For software which is copyrighted by the Free
|
||||
Software Foundation, write to the Free Software Foundation; we sometimes
|
||||
make exceptions for this. Our decision will be guided by the two goals
|
||||
of preserving the free status of all derivatives of our free software and
|
||||
of promoting the sharing and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
||||
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
||||
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
||||
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
||||
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
||||
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
||||
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
||||
REPAIR OR CORRECTION.
|
||||
|
||||
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
||||
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
||||
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
||||
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
||||
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
||||
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
||||
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program is interactive, make it output a short notice like this
|
||||
when it starts in an interactive mode:
|
||||
|
||||
Gnomovision version 69, Copyright (C) year name of author
|
||||
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, the commands you use may
|
||||
be called something other than `show w' and `show c'; they could even be
|
||||
mouse-clicks or menu items--whatever suits your program.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the program, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
||||
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1989
|
||||
Ty Coon, President of Vice
|
||||
|
||||
This General Public License does not permit incorporating your program into
|
||||
proprietary programs. If your program is a subroutine library, you may
|
||||
consider it more useful to permit linking proprietary applications with the
|
||||
library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License.
|
|
@@ -0,0 +1,304 @@
|
|||
# check_pve
|
||||
Icinga check command for Proxmox VE via API
|
||||
|
||||
## Setup
|
||||
|
||||
### Requirements
|
||||
|
||||
This check command depends on **Python 3** and the following modules:
|
||||
* requests
|
||||
* argparse
|
||||
* packaging
|
||||
|
||||
**Installation on Debian / Ubuntu**
|
||||
```
|
||||
apt install python3 python3-requests python3-packaging
|
||||
```
|
||||
|
||||
**Installation on Redhat 7 / CentOS 7**
|
||||
```
|
||||
yum install python36 python36-requests python36-packaging
|
||||
```
|
||||
|
||||
**Installation on FreeBSD**
|
||||
```
|
||||
pkg install python3 py39-requests py39-packaging
|
||||
```
|
||||
|
||||
**Installation from requirements file**
|
||||
```
|
||||
pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
**Installation as Docker container**
|
||||
```
|
||||
docker build -t check_pve .
|
||||
```
|
||||
After this, you can start the container like so:
|
||||
```
|
||||
docker run -d --name check_pve --rm check_pve
|
||||
```
|
||||
The container keeps running, so none of the requirements listed above need to be installed on the host (useful for environments where that is not possible).
|
||||
Running a check is as simple as:
|
||||
```
|
||||
docker exec check_pve python check_pve.py ....rest of the default arguments listed below....
|
||||
```
|
||||
|
||||
### Create an API user in Proxmox VE
|
||||
|
||||
Create a role named ``Monitoring`` and assign necessary privileges:
|
||||
|
||||
```
|
||||
pveum roleadd Monitoring
|
||||
pveum rolemod Monitoring --privs VM.Monitor,Sys.Audit,Datastore.Audit,VM.Audit
|
||||
```
|
||||
|
||||
Create a user named ``monitoring``:
|
||||
|
||||
```
|
||||
pveum useradd monitoring@pve --comment "The ICINGA 2 monitoring user"
|
||||
```
|
||||
|
||||
#### Use token based authorization (recommended)
|
||||
|
||||
Create an API token named `monitoring` for the user `monitoring`:
|
||||
|
||||
```
|
||||
pveum user token add monitoring@pve monitoring
|
||||
```
|
||||
|
||||
Please save the token secret as there isn't any way to fetch it at a later point.
|
||||
|
||||
Assign the role `Monitoring` to the token `monitoring` and to the user `monitoring@pve`:
|
||||
|
||||
```
|
||||
pveum acl modify / --roles Monitoring --users 'monitoring@pve'
|
||||
pveum acl modify / --roles Monitoring --tokens 'monitoring@pve!monitoring'
|
||||
```
|
||||
|
||||
|
||||
#### Use password based authorization
|
||||
|
||||
Set password for the user `monitoring`:
|
||||
|
||||
```
|
||||
pveum passwd monitoring@pve
|
||||
```
|
||||
|
||||
Assign the ``Monitoring`` role to the user ``monitoring``:
|
||||
|
||||
```
|
||||
pveum acl modify / --users monitoring@pve --roles Monitoring
|
||||
```
|
||||
|
||||
For further information about the Proxmox VE privilege system, have a look at the [documentation](https://pve.proxmox.com/pve-docs/pve-admin-guide.html#_strong_pveum_strong_proxmox_ve_user_manager).
|
||||
|
||||
|
||||
## Usage
|
||||
|
||||
The ``icinga2`` folder contains the command definition and service examples for use with Icinga2.
|
||||
|
||||
```
|
||||
usage: check_pve.py [-h] -e API_ENDPOINT [--api-port API_PORT] -u API_USER (-p API_PASSWORD | -t API_TOKEN) [-k] -m
|
||||
{cluster,version,cpu,memory,swap,storage,io_wait,updates,services,subscription,vm,vm_status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation} [-n NODE] [--name NAME] [--vmid VMID]
|
||||
[--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION] [--unit {GB,MB,KB,GiB,MiB,KiB,B}]
|
||||
|
||||
Check command for PVE hosts via API
|
||||
|
||||
options:
|
||||
-h, --help show this help message and exit
|
||||
|
||||
API Options:
|
||||
-e API_ENDPOINT, --api-endpoint API_ENDPOINT
|
||||
PVE api endpoint hostname
|
||||
--api-port API_PORT PVE api endpoint port
|
||||
-u API_USER, --username API_USER
|
||||
PVE api user (e.g. icinga2@pve or icinga2@pam, depending on which backend you have chosen in proxmox)
|
||||
-p API_PASSWORD, --password API_PASSWORD
|
||||
PVE API user password
|
||||
-t API_TOKEN, --api-token API_TOKEN
|
||||
PVE API token (format: TOKEN_ID=TOKEN_SECRET)
|
||||
-k, --insecure Don't verify HTTPS certificate
|
||||
|
||||
Check Options:
|
||||
-m {cluster,version,cpu,memory,swap,storage,io_wait,updates,services,subscription,vm,vm_status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation}, --mode {cluster,version,cpu,memory,swap,storage,io_wait,updates,services,subscription,vm,vm_status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation}
|
||||
Mode to use.
|
||||
-n NODE, --node NODE Node to check (necessary for all modes except cluster and version)
|
||||
--name NAME Name of storage, vm, or container
|
||||
--vmid VMID ID of virtual machine or container
|
||||
--expected-vm-status {running,stopped,paused}
|
||||
Expected VM status
|
||||
--ignore-vm-status Ignore VM status in checks
|
||||
--ignore-service NAME
|
||||
Ignore service NAME in checks
|
||||
--ignore-disk NAME Ignore disk NAME in health check
|
||||
-w THRESHOLD_WARNING, --warning THRESHOLD_WARNING
|
||||
Warning threshold for check value. Multiple thresholds with name:value,name:value
|
||||
-c THRESHOLD_CRITICAL, --critical THRESHOLD_CRITICAL
|
||||
Critical threshold for check value. Multiple thresholds with name:value,name:value
|
||||
-M Values are shown in the unit which is set with --unit (if available). Thresholds are also treated in this unit
|
||||
-V MIN_VERSION, --min-version MIN_VERSION
|
||||
The minimal pve version to check for. Any version lower than this will return CRITICAL.
|
||||
--unit {GB,MB,KB,GiB,MiB,KiB,B}
|
||||
Unit which is used for performance data and other values
|
||||
|
||||
|
||||
```
|
||||
|
||||
## Check examples
|
||||
|
||||
|
||||
**Check cluster health**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -t <API_TOKEN> -e <API_ENDPOINT> -m cluster
|
||||
OK - Cluster 'proxmox1' is healthy
|
||||
```
|
||||
|
||||
**Check PVE version**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m version -V 5.0.0
|
||||
OK - Your pve instance version '5.2' (0fcd7879) is up to date
|
||||
```
|
||||
|
||||
**Check CPU load**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m cpu -n node1
|
||||
OK - CPU usage is 2.4%|usage=2.4%;;
|
||||
```
|
||||
|
||||
**Check memory usage**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m memory -n node1
|
||||
OK - Memory usage is 37.44%|usage=37.44%;; used=96544.72MB;;;257867.91
|
||||
```
|
||||
|
||||
**Check disk-health**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m disk-health -n node1
|
||||
OK - All disks are healthy|wearout_sdb=96%;; wearout_sdc=96%;; wearout_sdd=96%;; wearout_sde=96%;;
|
||||
```
|
||||
|
||||
**Check storage usage**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name local
|
||||
OK - Storage usage is 54.23%|usage=54.23%;; used=128513.11MB;;;236980.36
|
||||
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name vms-disx
|
||||
CRITICAL - Storage 'vms-disx' doesn't exist on node 'node01'
|
||||
```
|
||||
|
||||
**Check subscription status**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m subscription -n node1 -w 50 -c 10
|
||||
OK - Subscription of level 'Community' is valid until 2019-01-09
|
||||
```
|
||||
|
||||
**Check VM status**
|
||||
|
||||
Without specifying a node name:
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm
|
||||
OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=8.33%;;
|
||||
```
|
||||
|
||||
You can also pass a container name for the VM check:
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-lxc
|
||||
OK - LXC 'test-lxc' on node 'node1' is running|cpu=0.11%;; memory=13.99%;;
|
||||
```
|
||||
|
||||
With memory thresholds:
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm -w 50 -c 80
|
||||
OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=40.33%;50.0;80.0
|
||||
```
|
||||
|
||||
With a specified node name, the check plugin verifies on which node the VM runs.
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm -n node1 --name test-vm
|
||||
OK - VM 'test-vm' is running on node 'node1'|cpu=1.85%;; memory=8.33%;;
|
||||
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm -n node1 --name test-vm
|
||||
WARNING - VM 'test-vm' is running on node 'node2' instead of 'node1'|cpu=1.85%;; memory=8.33%;;
|
||||
```
|
||||
|
||||
If you only want to gather metrics and don't care about the VM status, add the ``--ignore-vm-status`` flag:
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm --ignore-vm-status
|
||||
OK - VM 'test-vm' is not running
|
||||
```
|
||||
|
||||
Specify the expected VM status:
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm --expected-vm-status stopped
|
||||
OK - VM 'test-vm' is not running
|
||||
|
||||
```
|
||||
|
||||
For hostalive checks without gathering performance data use ``vm_status`` instead of ``vm``. The parameters are the same as with ``vm``.
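
For instance (same placeholder values as in the ``vm`` examples above), the earlier VM check becomes a pure status check like this; the output matches the ``vm`` examples minus the trailing performance data:
```
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm_status --name test-vm
```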
|
||||
|
||||
**Check swap usage**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m swap -n pve
|
||||
OK - Swap usage is 0.0 %|usage=0.0%;; used=0.0MB;;;8192.0
|
||||
```
|
||||
|
||||
**Check storage replication status**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m replication -n node1
|
||||
OK - No failed replication jobs on node1
|
||||
```
|
||||
|
||||
**Check ceph cluster health**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m ceph-health
|
||||
WARNING - Ceph Cluster is in warning state
|
||||
```
|
||||
|
||||
**Check ZFS pool health**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve
|
||||
OK - All ZFS pools are healthy
|
||||
```
|
||||
|
||||
Check for specific pool:
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve --name rpool
|
||||
OK - ZFS pool 'rpool' is healthy
|
||||
```
|
||||
|
||||
**Check ZFS pool fragmentation**
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-fragmentation -n pve -w 40 -c 60
|
||||
CRITICAL - 2 of 2 ZFS pools are above fragmentation thresholds:
|
||||
|
||||
- rpool (71 %) is CRITICAL
|
||||
- diskpool (50 %) is WARNING
|
||||
|fragmentation_diskpool=50%;40.0;60.0 fragmentation_rpool=71%;40.0;60.0
|
||||
|
||||
```
|
||||
|
||||
Check for specific pool:
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-fragmentation -n pve --name diskpool -w 40 -c 60
|
||||
WARNING - Fragmentation of ZFS pool 'diskpool' is above thresholds: 50 %|fragmentation=50%;40.0;60.0
|
||||
```
|
||||
|
||||
## FAQ
|
||||
|
||||
### Individual thresholds per metric
|
||||
|
||||
You can either specify a single warning or critical threshold, which is applied to all metrics, or define individual thresholds like this (`name:value,name:value,...`):
|
||||
|
||||
```
|
||||
./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm -w memory:50 -c cpu:50,memory:80
|
||||
OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;50.0; memory=40.33%;50.0;80.0
|
||||
```
|
||||
|
||||
### Could not connect to PVE API: Failed to resolve hostname
|
||||
|
||||
Verify that your DNS server is working and can resolve the hostname. If that is fine, check for proxy server environment variables (HTTP_PROXY, HTTPS_PROXY), which may block communication to port 8006.
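
A quick way to rule the proxy variables out (a diagnostic one-liner, not part of the plugin itself) is to clear them for a single run:
```
env -u HTTP_PROXY -u HTTPS_PROXY -u http_proxy -u https_proxy ./check_pve.py -u <API_USER> -t <API_TOKEN> -e <API_ENDPOINT> -m cluster
```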
|
||||
|
||||
## Contributors
|
||||
|
||||
Thank you to everyone who is contributing to `check_pve`: https://github.com/nbuchwitz/check_pve/graphs/contributors.
|
|
@@ -0,0 +1,819 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# check_pve.py - A check plugin for Proxmox Virtual Environment (PVE).
|
||||
# Copyright (C) 2018-2022 Nicolai Buchwitz <nb@tipi-net.de>
|
||||
#
|
||||
# Version: 1.2.2
|
||||
#
|
||||
# ------------------------------------------------------------------------------
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
import re
|
||||
|
||||
try:
|
||||
from enum import Enum
|
||||
from datetime import datetime
|
||||
from packaging import version
|
||||
import argparse
|
||||
import requests
|
||||
|
||||
except ImportError as e:
|
||||
print("Missing python module: {}".format(str(e)))
|
||||
sys.exit(255)
|
||||
|
||||
|
||||
class CheckState(Enum):
|
||||
OK = 0
|
||||
WARNING = 1
|
||||
CRITICAL = 2
|
||||
UNKNOWN = 3
|
||||
|
||||
|
||||
class CheckThreshold:
|
||||
def __init__(self, value: float):
|
||||
self.value = value
|
||||
|
||||
def __eq__(self, other):
|
||||
return self.value == other.value
|
||||
|
||||
def __lt__(self, other):
|
||||
return self.value < other.value
|
||||
|
||||
def __le__(self, other):
|
||||
return self.value <= other.value
|
||||
|
||||
def __gt__(self, other):
|
||||
return self.value > other.value
|
||||
|
||||
def __ge__(self, other):
|
||||
return self.value >= other.value
|
||||
|
||||
def check(self, value: float, lower: bool = False):
|
||||
if lower:
|
||||
return value < self.value
|
||||
else:
|
||||
return value > self.value
|
||||
|
||||
@staticmethod
|
||||
def threshold_type(arg: str):
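# Accepts either a single number ("80") that applies to all metrics, or a
# per-metric list such as "cpu:80,memory:90" (illustrative values).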
|
||||
thresholds = {}
|
||||
|
||||
try:
|
||||
thresholds[None] = CheckThreshold(float(arg))
|
||||
except ValueError:
|
||||
for t in arg.split(','):
|
||||
m = re.match("([a-z_0-9]+):([0-9.]+)", t)
|
||||
|
||||
if m:
|
||||
thresholds[m.group(1)] = CheckThreshold(float(m.group(2)))
|
||||
else:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"invalid threshold format: {}".format(t))
|
||||
|
||||
return thresholds
|
||||
|
||||
|
||||
class CheckPVE:
|
||||
VERSION = '1.2.2'
|
||||
API_URL = 'https://{hostname}:{port}/api2/json/{command}'
|
||||
UNIT_SCALE = {
|
||||
"GB": 10**9,
|
||||
"MB": 10**6,
|
||||
"KB": 10**3,
|
||||
"GiB": 2**30,
|
||||
"MiB": 2**20,
|
||||
"KiB": 2**10,
|
||||
"B": 1
|
||||
}
|
||||
|
||||
def check_output(self):
|
||||
message = self.check_message
|
||||
if self.perfdata:
|
||||
message += self.get_perfdata()
|
||||
|
||||
self.output(self.check_result, message)
|
||||
|
||||
@staticmethod
|
||||
def output(rc, message):
|
||||
prefix = rc.name
|
||||
message = '{} - {}'.format(prefix, message)
|
||||
|
||||
print(message)
|
||||
sys.exit(rc.value)
|
||||
|
||||
def get_url(self, command):
|
||||
return self.API_URL.format(hostname=self.options.api_endpoint, command=command, port=self.options.api_port)
|
||||
|
||||
def request(self, url, method='get', **kwargs):
|
||||
response = None
|
||||
try:
|
||||
if method == 'post':
|
||||
response = requests.post(
|
||||
url,
|
||||
verify=not self.options.api_insecure,
|
||||
data=kwargs.get('data', None),
|
||||
timeout=5
|
||||
)
|
||||
elif method == 'get':
|
||||
response = requests.get(
|
||||
url,
|
||||
verify=not self.options.api_insecure,
|
||||
cookies=self.__cookies,
|
||||
headers=self.__headers,
|
||||
params=kwargs.get('params', None),
|
||||
)
|
||||
else:
|
||||
self.output(CheckState.CRITICAL, "Unsupported request method: {}".format(method))
|
||||
except requests.exceptions.ConnectTimeout:
|
||||
self.output(CheckState.UNKNOWN, "Could not connect to PVE API: Connection timeout")
|
||||
except requests.exceptions.SSLError:
|
||||
self.output(CheckState.UNKNOWN, "Could not connect to PVE API: Certificate validation failed")
|
||||
except requests.exceptions.ConnectionError:
|
||||
self.output(CheckState.UNKNOWN, "Could not connect to PVE API: Failed to resolve hostname")
|
||||
|
||||
if response.ok:
|
||||
return response.json()['data']
|
||||
else:
|
||||
message = "Could not fetch data from API: "
|
||||
|
||||
if response.status_code == 401:
|
||||
message += "Could not connection to PVE API: invalid username or password"
|
||||
elif response.status_code == 403:
|
||||
message += "Access denied. Please check if API user has sufficient permissions / the role has been " \
|
||||
"assigned."
|
||||
else:
|
||||
message += "HTTP error code was {}".format(response.status_code)
|
||||
|
||||
self.output(CheckState.UNKNOWN, message)
|
||||
|
||||
def get_ticket(self):
|
||||
url = self.get_url('access/ticket')
|
||||
data = {"username": self.options.api_user, "password": self.options.api_password}
|
||||
result = self.request(url, "post", data=data)
|
||||
|
||||
return result['ticket']
|
||||
|
||||
def check_api_value(self, url, message, **kwargs):
|
||||
result = self.request(url)
|
||||
used = None
|
||||
|
||||
if 'key' in kwargs:
|
||||
result = result[kwargs.get('key')]
|
||||
|
||||
if isinstance(result, (dict,)):
|
||||
used_percent = self.get_value(result['used'], result['total'])
|
||||
used = self.get_value(result['used'])
|
||||
total = self.get_value(result['total'])
|
||||
|
||||
self.add_perfdata(kwargs.get('perfkey', 'usage'), used_percent)
|
||||
self.add_perfdata(kwargs.get('perfkey', 'used'), used, max=total, unit=self.options.unit)
|
||||
else:
|
||||
used_percent = round(float(result) * 100, 2)
|
||||
self.add_perfdata(kwargs.get('perfkey', 'usage'), used_percent)
|
||||
|
||||
if self.options.values_mb:
|
||||
message += ' {} {}'.format(used, self.options.unit)
|
||||
value = used
|
||||
else:
|
||||
message += ' {} {}'.format(used_percent, '%')
|
||||
value = used_percent
|
||||
|
||||
self.check_thresholds(value, message)
|
||||
|
||||
def check_vm_status(self, idx, **kwargs):
|
||||
url = self.get_url('cluster/resources', )
|
||||
data = self.request(url, params={'type': 'vm'})
|
||||
|
||||
expected_state = kwargs.get("expected_state", "running")
|
||||
only_status = kwargs.get("only_status", False)
|
||||
|
||||
found = False
|
||||
for vm in data:
|
||||
if vm['name'] == idx or vm['vmid'] == idx:
|
||||
# Check if VM (default) or LXC
|
||||
vm_type = "VM"
|
||||
if vm['type'] == 'lxc':
|
||||
vm_type = "LXC"
|
||||
|
||||
if vm['status'] != expected_state:
|
||||
self.check_message = "{} '{}' is {} (expected: {})".format(vm_type, vm['name'], vm['status'],
|
||||
expected_state)
|
||||
if not self.options.ignore_vm_status:
|
||||
self.check_result = CheckState.CRITICAL
|
||||
else:
|
||||
if self.options.node and self.options.node != vm['node']:
|
||||
self.check_message = "{} '{}' is {}, but located on node '{}' instead of '{}'" \
|
||||
.format(vm_type, vm['name'], expected_state, vm['node'], self.options.node)
|
||||
self.check_result = CheckState.WARNING
|
||||
else:
|
||||
self.check_message = "{} '{}' is {} on node '{}'" \
|
||||
.format(vm_type, vm['name'], expected_state, vm['node'])
|
||||
|
||||
if vm['status'] == 'running' and not only_status:
|
||||
cpu = round(vm['cpu'] * 100, 2)
|
||||
self.add_perfdata("cpu", cpu)
|
||||
|
||||
if self.options.values_mb:
|
||||
memory = self.scale_value(vm['mem'])
|
||||
self.add_perfdata("memory", memory, unit=self.options.unit, max=self.scale_value(vm['maxmem']))
|
||||
|
||||
else:
|
||||
memory = self.get_value(vm['mem'], vm['maxmem'])
|
||||
self.add_perfdata("memory", memory)
|
||||
|
||||
self.check_thresholds({"cpu": cpu, "memory": memory}, message=self.check_message)
|
||||
|
||||
found = True
|
||||
break
|
||||
|
||||
if not found:
|
||||
self.check_message = "VM or LXC '{}' not found".format(idx)
|
||||
self.check_result = CheckState.WARNING
|
||||
|
||||
def check_disks(self):
|
||||
url = self.get_url('nodes/{}/disks'.format(self.options.node))
|
||||
|
||||
failed = []
|
||||
unknown = []
|
||||
disks = self.request(url + '/list')
|
||||
for disk in disks:
|
||||
name = disk['devpath'].replace('/dev/', '')
|
||||
|
||||
if name in self.options.ignore_disks:
|
||||
continue
|
||||
|
||||
if disk['health'] == 'UNKNOWN':
|
||||
self.check_result = CheckState.WARNING
|
||||
unknown.append({"serial": disk["serial"], "device": disk['devpath']})
|
||||
|
||||
elif disk['health'] not in ('PASSED', 'OK'):
|
||||
self.check_result = CheckState.WARNING
|
||||
failed.append({"serial": disk["serial"], "device": disk['devpath']})
|
||||
|
||||
if disk['wearout'] != 'N/A':
|
||||
self.add_perfdata('wearout_{}'.format(name), disk['wearout'])
|
||||
|
||||
if failed:
|
||||
self.check_message = "{} of {} disks failed the health test:\n".format(len(failed), len(disks))
|
||||
for disk in failed:
|
||||
self.check_message += "- {} with serial '{}'\n".format(disk['device'], disk['serial'])
|
||||
|
||||
if unknown:
|
||||
self.check_message += "{} of {} disks have unknown health status:\n".format(len(unknown), len(disks))
|
||||
for disk in unknown:
|
||||
self.check_message += "- {} with serial '{}'\n".format(disk['device'], disk['serial'])
|
||||
|
||||
if not failed and not unknown:
|
||||
self.check_message = "All disks are healthy"
|
||||
|
||||
def check_replication(self):
|
||||
url = self.get_url('nodes/{}/replication'.format(self.options.node))
|
||||
|
||||
if self.options.vmid:
|
||||
data = self.request(url, params={'guest': self.options.vmid})
|
||||
else:
|
||||
data = self.request(url)
|
||||
|
||||
failed_jobs = [] # format: [{guest: str, fail_count: int, error: str}]
|
||||
performance_data = []
|
||||
|
||||
for job in data:
|
||||
if job['fail_count'] > 0:
|
||||
failed_jobs.append({'guest': job['guest'], 'fail_count': job['fail_count'], 'error': job['error']})
|
||||
else:
|
||||
performance_data.append({'id': job['id'], 'duration': job['duration']})
|
||||
|
||||
if len(failed_jobs) > 0:
|
||||
message = "Failed replication jobs on {}: ".format(self.options.node)
|
||||
for job in failed_jobs:
|
||||
message = message + "GUEST: {j[guest]}, FAIL_COUNT: {j[fail_count]}, ERROR: {j[error]} ; ".format(j=job)
|
||||
self.check_message = message
|
||||
self.check_result = CheckState.WARNING
|
||||
else:
|
||||
self.check_message = "No failed replication jobs on {}".format(self.options.node)
|
||||
self.check_result = CheckState.OK
|
||||
|
||||
if len(performance_data) > 0:
|
||||
for metric in performance_data:
|
||||
self.add_perfdata('duration_' + metric['id'], metric['duration'], unit='s')
|
||||
|
||||
def check_services(self):
|
||||
url = self.get_url('nodes/{}/services'.format(self.options.node))
|
||||
data = self.request(url)
|
||||
|
||||
failed = {}
|
||||
for service in data:
|
||||
if service['state'] != 'running' \
|
||||
and service.get('active-state', 'active') == 'active' \
|
||||
and service['name'] not in self.options.ignore_services:
|
||||
failed[service['name']] = service['desc']
|
||||
|
||||
if failed:
|
||||
self.check_result = CheckState.CRITICAL
|
||||
message = "{} services are not running:\n\n".format(len(failed))
|
||||
message += "\n".join(['- {} ({}) is not running'.format(failed[i], i) for i in failed])
|
||||
self.check_message = message
|
||||
else:
|
||||
self.check_message = "All services are running"
|
||||
|
||||
def check_subscription(self):
|
||||
url = self.get_url('nodes/{}/subscription'.format(self.options.node))
|
||||
data = self.request(url)
|
||||
|
||||
if data['status'] == 'NotFound':
|
||||
self.check_result = CheckState.WARNING
|
||||
self.check_message = "No valid subscription found"
|
||||
elif data['status'] == 'Inactive':
|
||||
self.check_result = CheckState.CRITICAL
|
||||
self.check_message = "Subscription expired"
|
||||
elif data['status'] == 'Active':
|
||||
subscription_due_date = data['nextduedate']
|
||||
subscription_product_name = data['productname']
|
||||
|
||||
date_expire = datetime.strptime(subscription_due_date, '%Y-%m-%d')
|
||||
date_today = datetime.today()
|
||||
delta = (date_expire - date_today).days
|
||||
|
||||
message = '{} is valid until {}'.format(
|
||||
subscription_product_name,
|
||||
subscription_due_date)
|
||||
message_warning_critical = '{} will expire in {} days ({})'.format(
|
||||
subscription_product_name,
|
||||
delta,
|
||||
subscription_due_date)
|
||||
|
||||
self.check_thresholds(delta, message, messageWarning=message_warning_critical,
|
||||
messageCritical=message_warning_critical, lowerValue=True)
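# Note (interpretation, based on the threshold comparison done in parse_args):
# with lowerValue=True the check presumably fires when delta drops *below* the
# configured values, since for a subscription a small number of remaining days
# is the problem, not a large one.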
|
||||
|
||||
def check_updates(self):
|
||||
url = self.get_url('nodes/{}/apt/update'.format(self.options.node))
|
||||
count = len(self.request(url))
|
||||
|
||||
if count:
|
||||
self.check_result = CheckState.WARNING
|
||||
msg = "{} pending update"
|
||||
if count > 1:
|
||||
msg += "s"
|
||||
self.check_message = msg.format(count)
|
||||
else:
|
||||
self.check_message = "System up to date"
|
||||
|
||||
def check_cluster_status(self):
|
||||
url = self.get_url('cluster/status')
|
||||
data = self.request(url)
|
||||
|
||||
nodes = {}
|
||||
quorate = None
|
||||
cluster = ''
|
||||
for elem in data:
|
||||
if elem['type'] == 'cluster':
|
||||
quorate = elem['quorate']
|
||||
cluster = elem['name']
|
||||
elif elem['type'] == 'node':
|
||||
nodes[elem['name']] = elem['online']
|
||||
|
||||
if quorate is None:
|
||||
self.check_message = 'No cluster configuration found'
|
||||
elif quorate:
|
||||
node_count = len(nodes)
|
||||
nodes_online_count = len({k: v for k, v in nodes.items() if v})
|
||||
|
||||
if node_count > nodes_online_count:
|
||||
diff = node_count - nodes_online_count
|
||||
self.check_result = CheckState.WARNING
|
||||
self.check_message = "Cluster '{}' is healthy, but {} node(s) offline'".format(cluster, diff)
|
||||
else:
|
||||
self.check_message = "Cluster '{}' is healthy'".format(cluster)
|
||||
|
||||
self.add_perfdata('nodes_total', node_count, unit='')
|
||||
self.add_perfdata('nodes_online', nodes_online_count, unit='')
|
||||
else:
|
||||
self.check_result = CheckState.CRITICAL
|
||||
self.check_message = 'Cluster is unhealthy - no quorum'
|
||||
|
||||
def check_zfs_fragmentation(self, name=None):
|
||||
url = self.get_url('nodes/{}/disks/zfs'.format(self.options.node))
|
||||
data = self.request(url)
|
||||
|
||||
warnings = []
|
||||
critical = []
|
||||
found = name is None
|
||||
for pool in data:
|
||||
found = found or name == pool['name']
|
||||
if (name is not None and name == pool['name']) or name is None:
|
||||
key = "fragmentation"
|
||||
if name is None:
|
||||
key += '_{}'.format(pool['name'])
|
||||
self.add_perfdata(key, pool['frag'])
|
||||
|
||||
threshold_name = "fragmentation_{}".format(pool['name'])
|
||||
threshold_warning = self.threshold_warning(threshold_name)
|
||||
threshold_critical = self.threshold_critical(threshold_name)
|
||||
|
||||
if threshold_critical is not None and pool['frag'] > float(
|
||||
threshold_critical.value):
|
||||
critical.append(pool)
|
||||
elif threshold_warning is not None and pool['frag'] > float(
|
||||
threshold_warning.value):
|
||||
warnings.append(pool)
|
||||
|
||||
if not found:
|
||||
self.check_result = CheckState.UNKNOWN
|
||||
self.check_message = "Could not fetch fragmentation of ZFS pool '{}'".format(name)
|
||||
else:
|
||||
if warnings or critical:
|
||||
value = None
|
||||
if critical:
|
||||
self.check_result = CheckState.CRITICAL
|
||||
if name is not None:
|
||||
value = critical[0]['frag']
|
||||
else:
|
||||
self.check_result = CheckState.WARNING
|
||||
if name is not None:
|
||||
value = warnings[0]['frag']
|
||||
|
||||
if name is not None:
|
||||
self.check_message = "Fragmentation of ZFS pool '{}' is above thresholds: {} %".format(name, value)
|
||||
else:
|
||||
message = "{} of {} ZFS pools are above fragmentation thresholds:\n\n".format(
|
||||
len(warnings) + len(critical), len(data))
|
||||
message += "\n".join(
|
||||
['- {} ({} %) is CRITICAL\n'.format(pool['name'], pool['frag']) for pool in critical])
|
||||
message += "\n".join(
|
||||
['- {} ({} %) is WARNING\n'.format(pool['name'], pool['frag']) for pool in warnings])
|
||||
self.check_message = message
|
||||
else:
|
||||
self.check_result = CheckState.OK
|
||||
if name is not None:
|
||||
self.check_message = "Fragmentation of ZFS pool '{}' is OK".format(name)
|
||||
else:
|
||||
self.check_message = "Fragmentation of all ZFS pools is OK"
|
||||
|
||||
def check_zfs_health(self, name=None):
|
||||
url = self.get_url('nodes/{}/disks/zfs'.format(self.options.node))
|
||||
data = self.request(url)
|
||||
|
||||
unhealthy = []
|
||||
found = name is None
|
||||
healthy_conditions = ['online']
|
||||
for pool in data:
|
||||
found = found or name == pool['name']
|
||||
if (name is not None and name == pool['name']) or name is None:
|
||||
if pool['health'].lower() not in healthy_conditions:
|
||||
unhealthy.append(pool)
|
||||
|
||||
if not found:
|
||||
self.check_result = CheckState.UNKNOWN
|
||||
self.check_message = "Could not fetch health of ZFS pool '{}'".format(name)
|
||||
else:
|
||||
if unhealthy:
|
||||
self.check_result = CheckState.CRITICAL
|
||||
message = "{} ZFS pools are not healthy:\n\n".format(len(unhealthy))
|
||||
message += "\n".join(
|
||||
['- {} ({}) is not healthy'.format(pool['name'], pool['health']) for pool in unhealthy])
|
||||
self.check_message = message
|
||||
else:
|
||||
self.check_result = CheckState.OK
|
||||
if name is not None:
|
||||
self.check_message = "ZFS pool '{}' is healthy".format(name)
|
||||
else:
|
||||
self.check_message = "All ZFS pools are healthy"
|
||||
|
||||
def check_ceph_health(self):
|
||||
url = self.get_url('cluster/ceph/status')
|
||||
data = self.request(url)
|
||||
ceph_health = data.get('health', {})
|
||||
|
||||
if 'status' not in ceph_health:
|
||||
self.check_result = CheckState.UNKNOWN
|
||||
self.check_message = "Could not fetch Ceph status from API. " \
|
||||
"Check the output of 'pvesh get cluster/ceph' on your node"
|
||||
return
|
||||
|
||||
if ceph_health['status'] == 'HEALTH_OK':
|
||||
self.check_result = CheckState.OK
|
||||
self.check_message = "Ceph Cluster is healthy"
|
||||
elif ceph_health['status'] == 'HEALTH_WARN':
|
||||
self.check_result = CheckState.WARNING
|
||||
self.check_message = "Ceph Cluster is in warning state"
|
||||
elif ceph_health['status'] == 'HEALTH_CRIT':
|
||||
self.check_result = CheckState.CRITICAL
|
||||
self.check_message = "Ceph Cluster is in critical state"
|
||||
else:
|
||||
self.check_result = CheckState.UNKNOWN
|
||||
self.check_message = "Ceph Cluster is in unknown state"
|
||||
|
||||
def check_storage(self, name):
|
||||
# check if storage exists
|
||||
url = self.get_url('nodes/{}/storage'.format(self.options.node))
|
||||
data = self.request(url)
|
||||
|
||||
if not any(s['storage'] == name for s in data):
|
||||
self.check_result = CheckState.CRITICAL
|
||||
self.check_message = "Storage '{}' doesn't exist on node '{}'".format(name, self.options.node)
|
||||
return
|
||||
|
||||
url = self.get_url('nodes/{}/storage/{}/status'.format(self.options.node, name))
|
||||
self.check_api_value(url, "Usage of storage '{}' is".format(name))
|
||||
|
||||
def check_version(self):
|
||||
url = self.get_url('version')
|
||||
data = self.request(url)
|
||||
if not data['version']:
|
||||
self.check_result = CheckState.UNKNOWN
|
||||
self.check_message = "Unable to determine pve version"
|
||||
elif self.options.min_version and version.parse(self.options.min_version) > version.parse(data['version']):
|
||||
self.check_result = CheckState.CRITICAL
|
||||
self.check_message = "Current pve version '{}' ({}) is lower than the min. required version '{}'".format(
|
||||
data['version'], data['repoid'], self.options.min_version)
|
||||
else:
|
||||
self.check_message = "Your pve instance version '{}' ({}) is up to date".format(data['version'],
|
||||
data['repoid'])
|
||||
|
||||
def check_memory(self):
|
||||
url = self.get_url('nodes/{}/status'.format(self.options.node))
|
||||
self.check_api_value(url, 'Memory usage is', key='memory')
|
||||
|
||||
def check_swap(self):
|
||||
url = self.get_url('nodes/{}/status'.format(self.options.node))
|
||||
self.check_api_value(url, 'Swap usage is', key='swap')
|
||||
|
||||
def check_cpu(self):
|
||||
url = self.get_url('nodes/{}/status'.format(self.options.node))
|
||||
self.check_api_value(url, 'CPU usage is', key='cpu')
|
||||
|
||||
def check_io_wait(self):
|
||||
url = self.get_url('nodes/{}/status'.format(self.options.node))
|
||||
self.check_api_value(url, 'IO wait is', key='wait', perfkey='wait')
|
||||
|
||||
def check_thresholds(self, value, message, **kwargs):
|
||||
is_warning = False
|
||||
is_critical = False
|
||||
|
||||
if not isinstance(value, dict):
|
||||
value = { None: value }
|
||||
|
||||
for metric, metric_value in value.items():
|
||||
value_warning = self.threshold_warning(metric)
|
||||
if value_warning is not None:
|
||||
is_warning = is_warning or value_warning.check(metric_value, kwargs.get('lowerValue', False))
|
||||
|
||||
value_critical = self.threshold_critical(metric)
|
||||
if value_critical is not None:
|
||||
is_critical = is_critical or value_critical.check(metric_value, kwargs.get('lowerValue', False))
|
||||
|
||||
if is_critical:
|
||||
self.check_result = CheckState.CRITICAL
|
||||
self.check_message = kwargs.get('messageCritical', message)
|
||||
elif is_warning:
|
||||
self.check_result = CheckState.WARNING
|
||||
self.check_message = kwargs.get('messageWarning', message)
|
||||
else:
|
||||
self.check_message = message
|
||||
|
||||
def scale_value(self, value):
|
||||
if self.options.unit in self.UNIT_SCALE:
|
||||
return value / self.UNIT_SCALE[self.options.unit]
|
||||
else:
|
||||
raise ValueError("Unsupported unit: '{}'".format(self.options.unit))
|
||||
|
||||
def threshold_warning(self, name: str):
|
||||
return self.options.threshold_warning.get(name, self.options.threshold_warning.get(None, None))
|
||||
|
||||
def threshold_critical(self, name: str):
|
||||
return self.options.threshold_critical.get(name, self.options.threshold_critical.get(None, None))
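# Illustration (assumed from the '-w name:value,name:value' help text below): a
# threshold given as 'cpu:80' is stored under the key 'cpu', while a plain
# '-w 80' ends up under the key None and serves as the fallback returned by
# the .get(name, .get(None)) chain above for metrics without a named threshold.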
|
||||
|
||||
def get_value(self, value, total=None):
|
||||
value = float(value)
|
||||
|
||||
if total:
|
||||
value /= float(total) / 100
|
||||
else:
|
||||
value = self.scale_value(value)
|
||||
|
||||
return round(value, 2)
|
||||
|
||||
def add_perfdata(self, name, value, **kwargs):
|
||||
unit = kwargs.get('unit', '%')
|
||||
|
||||
perfdata = '{}={}{}'.format(name, value, unit)
|
||||
|
||||
threshold_warning = self.threshold_warning(name)
|
||||
threshold_critical = self.threshold_critical(name)
|
||||
|
||||
perfdata += ';'
|
||||
if threshold_warning:
|
||||
perfdata += str(threshold_warning.value)
|
||||
|
||||
perfdata += ';'
|
||||
if threshold_critical:
|
||||
perfdata += str(threshold_critical.value)
|
||||
|
||||
perfdata += ';{}'.format(kwargs.get('min', 0))
|
||||
perfdata += ';{}'.format(kwargs.get('max', ''))
|
||||
|
||||
self.perfdata.append(perfdata)
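# The assembled string follows the usual monitoring-plugin performance data
# layout label=value[unit];[warn];[crit];[min];[max]. Illustrative example
# (threshold values are made up): add_perfdata('memory', 61.2) with a warning
# threshold of 80 and a critical threshold of 90 appends "memory=61.2%;80;90;0;".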
|
||||
|
||||
def get_perfdata(self):
|
||||
perfdata = ''
|
||||
|
||||
if len(self.perfdata):
|
||||
perfdata = '|'
|
||||
perfdata += ' '.join(self.perfdata)
|
||||
|
||||
return perfdata
|
||||
|
||||
def check(self):
|
||||
self.check_result = CheckState.OK
|
||||
|
||||
if self.options.mode == 'cluster':
|
||||
self.check_cluster_status()
|
||||
elif self.options.mode == 'version':
|
||||
self.check_version()
|
||||
elif self.options.mode == 'memory':
|
||||
self.check_memory()
|
||||
elif self.options.mode == 'swap':
|
||||
self.check_swap()
|
||||
elif self.options.mode == 'io_wait':
|
||||
self.check_io_wait()
|
||||
elif self.options.mode == 'disk-health':
|
||||
self.check_disks()
|
||||
elif self.options.mode == 'cpu':
|
||||
self.check_cpu()
|
||||
elif self.options.mode == 'services':
|
||||
self.check_services()
|
||||
elif self.options.mode == 'updates':
|
||||
self.check_updates()
|
||||
elif self.options.mode == 'subscription':
|
||||
self.check_subscription()
|
||||
elif self.options.mode == 'storage':
|
||||
self.check_storage(self.options.name)
|
||||
elif self.options.mode in ['vm', 'vm_status']:
|
||||
only_status = self.options.mode == 'vm_status'
|
||||
|
||||
if self.options.name:
|
||||
idx = self.options.name
|
||||
else:
|
||||
idx = self.options.vmid
|
||||
|
||||
if self.options.expected_vm_status:
|
||||
self.check_vm_status(idx, expected_state=self.options.expected_vm_status, only_status=only_status)
|
||||
else:
|
||||
self.check_vm_status(idx, only_status=only_status)
|
||||
elif self.options.mode == 'replication':
|
||||
self.check_replication()
|
||||
elif self.options.mode == 'ceph-health':
|
||||
self.check_ceph_health()
|
||||
elif self.options.mode == 'zfs-health':
|
||||
self.check_zfs_health(self.options.name)
|
||||
elif self.options.mode == 'zfs-fragmentation':
|
||||
self.check_zfs_fragmentation(self.options.name)
|
||||
else:
|
||||
message = "Check mode '{}' not known".format(self.options.mode)
|
||||
self.output(CheckState.UNKNOWN, message)
|
||||
|
||||
self.check_output()
|
||||
|
||||
def parse_args(self):
|
||||
p = argparse.ArgumentParser(description='Check command for PVE hosts via API')
|
||||
|
||||
api_opts = p.add_argument_group('API Options')
|
||||
|
||||
api_opts.add_argument("-e", "--api-endpoint", required=True, help="PVE api endpoint hostname")
|
||||
api_opts.add_argument("--api-port", required=False, help="PVE api endpoint port")
|
||||
|
||||
api_opts.add_argument("-u", "--username", dest='api_user', required=True,
|
||||
help="PVE api user (e.g. icinga2@pve or icinga2@pam, depending on which backend you "
|
||||
"have chosen in proxmox)")
|
||||
|
||||
group = api_opts.add_mutually_exclusive_group(required=True)
|
||||
group.add_argument("-p", "--password", dest='api_password', help="PVE API user password")
|
||||
group.add_argument("-t", "--api-token", dest="api_token", help="PVE API token (format: TOKEN_ID=TOKEN_SECRET")
|
||||
|
||||
api_opts.add_argument("-k", "--insecure", dest='api_insecure', action='store_true', default=False,
|
||||
help="Don't verify HTTPS certificate")
|
||||
|
||||
api_opts.set_defaults(api_port=8006)
|
||||
|
||||
check_opts = p.add_argument_group('Check Options')
|
||||
|
||||
check_opts.add_argument("-m", "--mode",
|
||||
choices=(
|
||||
'cluster', 'version', 'cpu', 'memory', 'swap', 'storage', 'io_wait', 'updates', 'services',
|
||||
'subscription', 'vm', 'vm_status', 'replication', 'disk-health', 'ceph-health',
|
||||
'zfs-health', 'zfs-fragmentation'),
|
||||
required=True,
|
||||
help="Mode to use.")
|
||||
|
||||
check_opts.add_argument('-n', '--node', dest='node',
|
||||
help='Node to check (necessary for all modes except cluster and version)')
|
||||
|
||||
check_opts.add_argument('--name', dest='name',
|
||||
help='Name of storage, vm, or container')
|
||||
|
||||
check_opts.add_argument('--vmid', dest='vmid', type=int,
|
||||
help='ID of virtual machine or container')
|
||||
|
||||
check_opts.add_argument('--expected-vm-status', choices=('running', 'stopped', 'paused'),
|
||||
help='Expected VM status')
|
||||
|
||||
check_opts.add_argument('--ignore-vm-status', dest='ignore_vm_status', action='store_true',
|
||||
help='Ignore VM status in checks',
|
||||
default=False)
|
||||
|
||||
check_opts.add_argument('--ignore-service', dest='ignore_services', action='append', metavar='NAME',
|
||||
help='Ignore service NAME in checks', default=[])
|
||||
|
||||
check_opts.add_argument('--ignore-disk', dest='ignore_disks', action='append', metavar='NAME',
|
||||
help='Ignore disk NAME in health check', default=[])
|
||||
|
||||
check_opts.add_argument('-w', '--warning', dest='threshold_warning', type=CheckThreshold.threshold_type,
|
||||
default={}, help='Warning threshold for check value. Multiple thresholds can be given as name:value,name:value')
|
||||
check_opts.add_argument('-c', '--critical', dest='threshold_critical', type=CheckThreshold.threshold_type,
|
||||
default={}, help='Critical threshold for check value. Multiple thresholds can be given as name:value,name:value')
|
||||
check_opts.add_argument('-M', dest='values_mb', action='store_true', default=False,
|
||||
help='Show values in the unit set with --unit (where available); thresholds are interpreted in the same unit')
|
||||
check_opts.add_argument('-V', '--min-version', dest='min_version', type=str,
|
||||
help='The minimal pve version to check for. Any version lower than this will return '
|
||||
'CRITICAL.')
|
||||
|
||||
check_opts.add_argument('--unit', choices=self.UNIT_SCALE.keys(), default='MiB', help='Unit which is used for performance data and other values')
|
||||
|
||||
options = p.parse_args()
|
||||
|
||||
if not options.node and options.mode not in ['cluster', 'vm', 'vm_status', 'version', 'ceph-health']:
|
||||
p.print_usage()
|
||||
message = "{}: error: --mode {} requires node name (--node)".format(p.prog, options.mode)
|
||||
self.output(CheckState.UNKNOWN, message)
|
||||
|
||||
if not options.vmid and not options.name and options.mode in ('vm', 'vm_status'):
|
||||
p.print_usage()
|
||||
message = "{}: error: --mode {} requires either vm name (--name) or id (--vmid)".format(p.prog,
|
||||
options.mode)
|
||||
self.output(CheckState.UNKNOWN, message)
|
||||
|
||||
if not options.name and options.mode == 'storage':
|
||||
p.print_usage()
|
||||
message = "{}: error: --mode {} requires storage name (--name)".format(p.prog, options.mode)
|
||||
self.output(CheckState.UNKNOWN, message)
|
||||
|
||||
def compare_thresholds(threshold_warning, threshold_critical, comparator):
|
||||
ok = True
|
||||
keys = set(list(threshold_warning.keys()) + list(threshold_critical.keys()))
|
||||
for key in keys:
|
||||
if (key in threshold_warning and key in threshold_critical) or (None in threshold_warning and None in threshold_critical):
|
||||
ok = ok and comparator(threshold_warning[key], threshold_critical[key])
|
||||
elif key in threshold_warning and None in threshold_critical:
|
||||
ok = ok and comparator(threshold_warning[key], threshold_critical[None])
|
||||
elif key in threshold_critical and None in threshold_warning:
|
||||
ok = ok and comparator(threshold_warning[None], threshold_critical[key])
|
||||
|
||||
return ok
|
||||
|
||||
if options.threshold_warning and options.threshold_critical:
|
||||
if options.mode != 'subscription' and not compare_thresholds(options.threshold_warning, options.threshold_critical, lambda w,c: w<=c):
|
||||
p.error("Critical value must be greater than warning value")
|
||||
elif options.mode == 'subscription' and not compare_thresholds(options.threshold_warning, options.threshold_critical, lambda w,c: w>=c):
|
||||
p.error("Critical value must be lower than warning value")
|
||||
|
||||
self.options = options
|
||||
|
||||
def __init__(self):
|
||||
self.options = {}
|
||||
self.ticket = None
|
||||
self.perfdata = []
|
||||
self.check_result = CheckState.UNKNOWN
|
||||
self.check_message = ""
|
||||
|
||||
self.__headers = {}
|
||||
self.__cookies = {}
|
||||
|
||||
self.parse_args()
|
||||
|
||||
if self.options.api_insecure:
|
||||
# disable urllib3 warning about insecure requests
|
||||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
if self.options.api_password is not None:
|
||||
self.__cookies['PVEAuthCookie'] = self.get_ticket()
|
||||
elif self.options.api_token is not None:
|
||||
self.__headers["Authorization"] = "PVEAPIToken={}!{}".format(self.options.api_user, self.options.api_token)
|
||||
|
||||
pve = CheckPVE()
|
||||
pve.check()
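For reference, a typical invocation of the finished plugin might look like the following (endpoint, node, user and token are placeholders; every flag used here is one of those defined in parse_args above):

./check_pve.py -e pve01.example.com -u monitoring@pve -t monitoring=aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee -m cpu -n pve01 -w 80 -c 90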
|
|
@ -0,0 +1,973 @@
|
|||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": 11,
|
||||
"links": [],
|
||||
"refresh": "30s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": "250px",
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "icinga2",
|
||||
"fill": 1,
|
||||
"id": 1,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"hideEmpty": false,
|
||||
"hideZero": false,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "CRITICAL",
|
||||
"color": "#BF1B00",
|
||||
"fill": 0,
|
||||
"legend": false
|
||||
},
|
||||
{
|
||||
"alias": "WARNING",
|
||||
"color": "#EAB839",
|
||||
"fill": 0,
|
||||
"legend": false
|
||||
},
|
||||
{
|
||||
"alias": "memory used",
|
||||
"color": "#0A437C",
|
||||
"yaxis": 2
|
||||
},
|
||||
{
|
||||
"alias": "memory used",
|
||||
"fill": 0
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"alias": "$service usage",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"value"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "usage"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"alias": "WARNING",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
|
||||
"rawQuery": false,
|
||||
"refId": "C",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"warn"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "usage"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"alias": "CRITICAL",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
|
||||
"rawQuery": false,
|
||||
"refId": "B",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"crit"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "usage"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$service usage",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percent",
|
||||
"label": "% usage",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": "used MB",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "icinga2",
|
||||
"fill": 1,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"hideEmpty": false,
|
||||
"hideZero": false,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "CRITICAL",
|
||||
"color": "#BF1B00",
|
||||
"fill": 0,
|
||||
"legend": false
|
||||
},
|
||||
{
|
||||
"alias": "WARNING",
|
||||
"color": "#EAB839",
|
||||
"fill": 0,
|
||||
"legend": false
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"alias": "$service used",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"value"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "used"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"alias": "WARNING",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
|
||||
"rawQuery": false,
|
||||
"refId": "C",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"warn"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "used"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"alias": "CRITICAL",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
|
||||
"rawQuery": false,
|
||||
"refId": "B",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"crit"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "used"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "$service used",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": "used",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": "used MB",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "icinga2",
|
||||
"fill": 1,
|
||||
"id": 3,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": true,
|
||||
"current": true,
|
||||
"hideEmpty": false,
|
||||
"hideZero": false,
|
||||
"max": true,
|
||||
"min": true,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [
|
||||
{
|
||||
"alias": "CRITICAL",
|
||||
"color": "#BF1B00",
|
||||
"fill": 0,
|
||||
"legend": false
|
||||
},
|
||||
{
|
||||
"alias": "WARNING",
|
||||
"color": "#EAB839",
|
||||
"fill": 0,
|
||||
"legend": false
|
||||
},
|
||||
{
|
||||
"alias": "memory used",
|
||||
"color": "#0A437C",
|
||||
"yaxis": 2
|
||||
},
|
||||
{
|
||||
"alias": "memory used",
|
||||
"fill": 0
|
||||
}
|
||||
],
|
||||
"spaceLength": 10,
|
||||
"span": 4,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"alias": "I/O wait",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"value"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "wait"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"alias": "WARNING",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
|
||||
"rawQuery": false,
|
||||
"refId": "C",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"warn"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "wait"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"alias": "CRITICAL",
|
||||
"dsType": "influxdb",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"metric"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"none"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"measurement": "pve",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
|
||||
"rawQuery": false,
|
||||
"refId": "B",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"crit"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": [
|
||||
{
|
||||
"key": "hostname",
|
||||
"operator": "=~",
|
||||
"value": "/^$hostname$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "service",
|
||||
"operator": "=~",
|
||||
"value": "/^$service$/"
|
||||
},
|
||||
{
|
||||
"condition": "AND",
|
||||
"key": "metric",
|
||||
"operator": "=",
|
||||
"value": "wait"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "I/O wait",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "percent",
|
||||
"label": "% usage",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "bytes",
|
||||
"label": "used MB",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": false
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "icmp checks",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "pve01.willi-graf.local",
|
||||
"value": "pve01.willi-graf.local"
|
||||
},
|
||||
"datasource": "icinga2",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "hostname",
|
||||
"options": [],
|
||||
"query": "SHOW TAG VALUES WITH KEY = \"hostname\"",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "io_wait",
|
||||
"value": "io_wait"
|
||||
},
|
||||
"datasource": "icinga2",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "service",
|
||||
"options": [],
|
||||
"query": "SHOW TAG VALUES WITH KEY = \"service\" where hostname =~ /^$hostname$/",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-2m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "icinga-pve-metrics",
|
||||
"version": 23
|
||||
}
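A note on the data this dashboard expects: every panel reads the InfluxDB measurement "pve" with the tags hostname, service and metric and the fields value, warn and crit. That is the layout Icinga 2's InfluxdbWriter produces from the plugin's performance data (one point per perfdata label, with the check command name as the default measurement), but the datasource name "icinga2" and the measurement name are assumptions that have to match your own InfluxdbWriter configuration.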
|
|
@ -0,0 +1,74 @@
|
|||
object CheckCommand "pve" {
|
||||
import "plugin-check-command"
|
||||
|
||||
command = [ PluginDir + "/check_pve.py" ]
|
||||
|
||||
arguments = {
|
||||
"-e" = {
|
||||
value = "$pve_host$"
|
||||
required = true
|
||||
description = "Hostname for PVE API"
|
||||
}
|
||||
"-u" = {
|
||||
value = "$pve_user$"
|
||||
required = true
|
||||
description = "API user (ex. monitoring@pve)"
|
||||
}
|
||||
"-p" = {
|
||||
value = "$pve_password$"
|
||||
required = true
|
||||
description = "API user password"
|
||||
}
|
||||
"-k" = {
|
||||
set_if = "$pve_insecure_connection$"
|
||||
description = "Connect to this host instead of $pve_host$"
|
||||
}
|
||||
"-m" = {
|
||||
value = "$pve_mode$"
|
||||
required = true
|
||||
description = "Check mode (cluster, version, updates, subscription, storage, cpu, memory, io_wait, vm, replication)"
|
||||
}
|
||||
"-n" = {
|
||||
value = "$pve_node$"
|
||||
description = "Node to check (necessary for all modes except cluster and version)"
|
||||
}
|
||||
"--name" = {
|
||||
value = "$pve_resource_name$"
|
||||
description = "Name of storage or vm to check"
|
||||
}
|
||||
"--expected-vm-status" = {
|
||||
value = "$pve_expected_vm_status$"
|
||||
description = "Expected status of the VM"
|
||||
}
|
||||
"--ignore-service" = {
|
||||
repeat_key = true
|
||||
value = "$pve_ignore_services$"
|
||||
description = "Ignore services in check"
|
||||
}
|
||||
"--ignore-disk" = {
|
||||
repeat_key = true
|
||||
value = "$pve_ignore_disks$"
|
||||
description = "Ignore disks in check"
|
||||
}
|
||||
"--ignore-vm-status" = {
|
||||
set_if = "$pve_ignore_vm_status$"
|
||||
description = "Ignore VM status in check"
|
||||
}
|
||||
"-w" = {
|
||||
value = "$pve_warning$"
|
||||
description = "Warning treshold"
|
||||
}
|
||||
"-c" = {
|
||||
value = "$pve_critical$"
|
||||
description = "Critical treshold"
|
||||
}
|
||||
"-M" = {
|
||||
set_if = "$pve_tresholds_mb$"
|
||||
description = "Unit of tresholds and values is MB"
|
||||
}
|
||||
"-V" = {
|
||||
value = "$pve_min_version$"
|
||||
description = "Minimal pve version. Everything lower than this will return CRITICAL."
|
||||
}
|
||||
}
|
||||
}
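Worth noting: this command definition only wires up password authentication (-p); the plugin's API-token flag (-t) is not exposed here, so a corresponding argument entry would have to be added if token authentication is preferred.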
|
|
@ -0,0 +1,139 @@
|
|||
template Host "proxmox-host" {
|
||||
import "generic-host"
|
||||
|
||||
vars.pve_host = name
|
||||
vars.pve_node = name.split(".")[0]
|
||||
// ... or if not matching the fqdn (nodename.domain.example)
|
||||
// vars.pve_node = "proxmox-host"
|
||||
|
||||
// if your Icinga host doesn't trust your PVE certificate, you'll have to uncomment this line
|
||||
// vars.pve_insecure_connection = true
|
||||
vars.pve_user = "monitor@pve"
|
||||
vars.pve_password = "SuperSecretPassw0rd"
|
||||
|
||||
// change to false if the node is not a member of a PVE cluster
|
||||
vars.pve_cluster = true
|
||||
}
|
||||
|
||||
object Host "proxmox-host.domain.example" {
|
||||
import "proxmox-host"
|
||||
|
||||
address = "192.168.42.42"
|
||||
|
||||
vars.pve_storage["flashpool"] = {
|
||||
pve_warning = 80
|
||||
pve_critical = 90
|
||||
}
|
||||
|
||||
vars.pve_storage["diskpool"] = {
|
||||
pve_warning = 80
|
||||
pve_critical = 90
|
||||
}
|
||||
|
||||
// Ignore these disks in health check (USB sticks, SD cards, etc.)
|
||||
vars.pve_ignore_disks = [ "sdn", "sdg" ]
|
||||
|
||||
vars.virtual_machines["vm-01"] = {
|
||||
}
|
||||
}
|
||||
|
||||
template Service "pve-service" {
|
||||
import "generic-service"
|
||||
|
||||
check_command = "pve"
|
||||
}
|
||||
|
||||
apply Service "cluster" {
|
||||
import "pve-service"
|
||||
|
||||
vars.pve_mode = "cluster"
|
||||
|
||||
assign where host.vars.pve_host && host.vars.pve_cluster
|
||||
}
|
||||
|
||||
apply Service "services" {
|
||||
import "pve-service"
|
||||
|
||||
vars.pve_mode = "services"
|
||||
|
||||
// Ignore the corosync service on single (non-cluster) nodes
|
||||
if (!host.vars.pve_cluster) {
|
||||
vars.pve_ignore_services = host.vars.pve_ignore_services || []
|
||||
vars.pve_ignore_services.add("corosync")
|
||||
}
|
||||
|
||||
assign where host.vars.pve_host
|
||||
}
|
||||
|
||||
apply Service "updates" {
|
||||
import "pve-service"
|
||||
|
||||
check_interval = 12h
|
||||
retry_interval = 2h
|
||||
max_check_attempts = 3
|
||||
|
||||
vars.pve_mode = "updates"
|
||||
|
||||
assign where host.vars.pve_host
|
||||
}
|
||||
|
||||
apply Service "disk-health" {
|
||||
import "pve-service"
|
||||
|
||||
vars.pve_mode = "disk-health"
|
||||
|
||||
assign where host.vars.pve_host
|
||||
}
|
||||
|
||||
apply Service "io_wait" {
|
||||
import "pve-service"
|
||||
|
||||
vars.pve_mode = "io_wait"
|
||||
|
||||
vars.pve_warning = 10
|
||||
vars.pve_critical = 30
|
||||
|
||||
assign where host.vars.pve_host
|
||||
}
|
||||
|
||||
apply Service "cpu" {
|
||||
import "pve-service"
|
||||
|
||||
vars.pve_mode = "cpu"
|
||||
|
||||
vars.pve_warning = 70
|
||||
vars.pve_critical = 90
|
||||
|
||||
assign where host.vars.pve_host
|
||||
}
|
||||
|
||||
apply Service "memory" {
|
||||
import "pve-service"
|
||||
|
||||
vars.pve_mode = "memory"
|
||||
|
||||
vars.pve_warning = 80
|
||||
vars.pve_critical = 90
|
||||
|
||||
assign where host.vars.pve_host
|
||||
}
|
||||
|
||||
apply Service "storage " for (storage => config in host.vars.pve_storage) {
|
||||
import "pve-service"
|
||||
|
||||
vars += config
|
||||
|
||||
vars.pve_mode = "storage"
|
||||
vars.pve_resource_name = storage
|
||||
}
|
||||
|
||||
apply Service "pve-vm " for (vm => config in host.vars.virtual_machines) {
|
||||
import "pve-service"
|
||||
|
||||
vars += config
|
||||
|
||||
vars.pve_mode = "vm"
|
||||
vars.pve_resource_name = vm
|
||||
|
||||
assign where host.vars.pve_host
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
requests
|
||||
argparse
|
||||
packaging
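The dependencies can be installed in the usual way, e.g. with pip install -r requirements.txt. On Python 3, argparse ships with the standard library, so that entry only matters for very old interpreters.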
|
|
@ -9,7 +9,7 @@ import aiofiles.os
|
|||
import magic
|
||||
import markdown
|
||||
from PIL import Image
|
||||
from nio import AsyncClient, LoginResponse, RoomSendError, UploadResponse, MatrixRoom, RoomLeaveResponse, RoomForgetResponse
|
||||
from nio import AsyncClient, LoginResponse, MatrixRoom, RoomForgetResponse, RoomLeaveResponse, RoomSendError, UploadResponse
|
||||
|
||||
from . import nagios
|
||||
|
||||
|
@ -166,7 +166,6 @@ async def leave_all_rooms_async(client, exclude_starting_with=None):
|
|||
await client.sync()
|
||||
invited_rooms = copy.copy(client.invited_rooms) # RuntimeError: dictionary changed size during iteration
|
||||
for name, room in invited_rooms.items():
|
||||
print(room.room_id)
|
||||
# if exclude_starting_with and room.named_room_name() is not None and room.named_room_name().startswith(exclude_starting_with):
|
||||
# continue
|
||||
s, l, f = await leave_room_async(room.room_id, client)
|
||||
|
|