From d1665ef9d113194d9f8594d7c00d6ec6b9113ccb Mon Sep 17 00:00:00 2001 From: Cyberes Date: Fri, 21 Apr 2023 23:54:17 -0600 Subject: [PATCH] updates --- check_curl | 240 +++++ check_federation.py | 2 +- check_media_cdn.py | 12 +- check_monitor_bot.py | 6 +- check_nginx | 113 +++ check_pve/Dockerfile | 10 + check_pve/LICENSE | 339 +++++++ check_pve/README.md | 304 ++++++ check_pve/check_pve.py | 819 ++++++++++++++++ check_pve/grafana/pve-metrics-dashboard.json | 973 +++++++++++++++++++ check_pve/icinga2/command.conf | 74 ++ check_pve/icinga2/service.conf | 139 +++ check_pve/requirements.txt | 3 + checker/synapse_client.py | 3 +- 14 files changed, 3026 insertions(+), 11 deletions(-) create mode 100644 check_curl create mode 100644 check_nginx create mode 100644 check_pve/Dockerfile create mode 100644 check_pve/LICENSE create mode 100644 check_pve/README.md create mode 100644 check_pve/check_pve.py create mode 100644 check_pve/grafana/pve-metrics-dashboard.json create mode 100644 check_pve/icinga2/command.conf create mode 100644 check_pve/icinga2/service.conf create mode 100644 check_pve/requirements.txt diff --git a/check_curl b/check_curl new file mode 100644 index 0000000..862e96b --- /dev/null +++ b/check_curl @@ -0,0 +1,240 @@ +#!/bin/bash +# startup checks + +if [ -z "$BASH" ]; then + echo "Please use BASH." + exit 3 +fi +if [ ! -e "/usr/bin/which" ]; then + echo "/usr/bin/which is missing." + exit 3 +fi +curl=$(which curl) +if [ $? -ne 0 ]; then + echo "Please install curl." 
+ exit 3 +fi + + +# Default Values +proxy="" +method="GET" +body="" +contains="" +lacks="" +insecure=0 +debug=0 +warning=700 +encodeurl=0 +critical=2000 +url="" +follow=0 +header="" +name="default" +cookies=0 + +# Usage Info +usage() { + echo '''Usage: check_curl [OPTIONS] + [OPTIONS]: + -U URL Target URL + -M METHOD HTTP Method (default: GET) + -N NAME Display Name of scanned object (default: default) + -B BODY Request Body to be sent (default: not sent) + -E ENCODEURL Send body defined with url encoding (curl --data-urlencode) (default: off) + -I INSECURE Sets the curl flag --insecure + -C CONTAINS If not contained in response body, CRITICAL will be returned + -L LACKS If contained in response body, CRITICAL will be returned (-C has priority when both are set) + -w WARNING Warning threshold in milliseconds (default: 700) + -c CRITICAL Critical threshold in milliseconds (default: 2000) + -H HEADER Send Header (i.E. "AUTHORIZATION: Bearer 8*.UdUYwrl!nK") + -F FOLLOW Follow redirects (default: OFF) + -D DEBUG Only prints the curl command (default: OFF) + -P PROXY Set Proxy Address (default: No Proxy) + -K COOKIES Enables/Disabled cookie handling in a temporary cookie jar''' +} + + +# Check which threshold was reached +checkTime() { + if [ $1 -gt $critical ]; then + echo -n "CRITICAL: Slow " + elif [ $1 -gt $warning ]; then + echo -n "WARNING: Slow " + else + echo -n "OK" + fi +} + +# Return code value +getStatus() { + if [ $1 -gt $critical ]; then + return 2 + elif [ $1 -gt $warning ]; then + return 1 + else + return 0 + fi +} + +#main +#get options +while getopts "P:M:B:C:w:c:U:H:IFN:O:EL:D:K" opt; do + case $opt in + K) + cookies=1 + ;; + P) + proxy=$OPTARG + ;; + M) + method=$OPTARG + ;; + B) + body=$OPTARG + ;; + C) + contains=$OPTARG + ;; + w) + warning=$OPTARG + ;; + c) + critical=$OPTARG + ;; + U) + url=$OPTARG + ;; + L) + lacks=$OPTARG + ;; + I) + insecure=1 + ;; + N) + name=$( echo $OPTARG | sed -e 's/[^A-Za-z0-9._-]/_/g' ) + ;; + E) + encodeurl=1 + ;; + H) 
+ header=$OPTARG + ;; + F) + follow=1 + ;; + D) + debug=1 + ;; + *) + usage + exit 3 + ;; + esac +done + +#hostname is required +if [ -z "$url" ] || [ $# -eq 0 ]; then + echo "Error: URL is required" + usage + exit 3 +fi + +proxyarg="" +if [ -n "$proxy" ] ; then # quoted: an unquoted test breaks on values containing whitespace + proxyarg=" -x "$proxy" " +fi +headerarg="" +if [ ! -z "$header" ] ; then + headerarg=' -H "'$header'" ' +fi +followarg="" +if [ $follow -eq 1 ] ; then + followarg=" -L " +fi +insecurearg="" +if [ $insecure -eq 1 ] ; then + insecurearg=" --insecure " +fi +cookiesarg="" +if [ $cookies -eq 1 ] ; then + COOKIE_JAR_TEMP_PATH=$(mktemp /tmp/check_curl_cookiejar.XXXXXX) + cookiesarg=" -c ${COOKIE_JAR_TEMP_PATH} -b ${COOKIE_JAR_TEMP_PATH}" +fi +bodyarg="" +if [ -n "$body" ]; then # quoted: otherwise a body with spaces makes the test error out and the body is never sent + body=$(printf '%s\n' "$body" | sed "s/\"/\\\\\"/g") # printf + quotes: no globbing, no option parsing, whitespace kept + bodyarg=" --data \""$body"\"" + if [ $encodeurl -eq 1 ]; then + bodyarg=" --data-urlencode \""$body"\"" + fi +fi + +if [ $debug -eq 1 ]; then + echo $curl --no-keepalive -s $insecurearg $proxyarg $followarg $bodyarg $headerarg -X $method $cookiesarg "$url" + exit 0 +else + start=$(echo $(($(date +%s%N)/1000000))) + body=$(eval $curl --no-keepalive -s $insecurearg $proxyarg $followarg $bodyarg $headerarg -X $method $cookiesarg "$url") + status=$? +fi + +if [ $cookies -eq 1 ] ; then + rm -f ${COOKIE_JAR_TEMP_PATH} +fi + +end=$(echo $(($(date +%s%N)/1000000))) +#decide output by return code +if [ $status -eq 0 ] ; then + if [ -n "$contains" ]; then + if [[ ! $body =~ $contains ]]; then + echo "CRITICAL: body does not contain '${contains}'. 
Body: '$(echo $body | sed 's/\(.\{50\}\).*/\1.../')' |time=$((end - start))ms;${warning};${critical};0;"$critical"ms" + exit 2 + fi + fi + if [ -n "$lacks" ]; then + if [[ $body == *$lacks* ]]; then + echo "CRITICAL: body contains '${lacks}'|time=$((end - start))ms;${warning};${critical};0;"$critical"ms" +exit 2 + fi + fi + echo "$(checkTime $((end - start))) $((end - start))ms - ${url}|time=$((end - start))ms;${warning};${critical};0;"$critical"ms" + getStatus $((end - start)) + exit $? +else + case $status in + 1) + echo "CRITICAL: Unsupported protocol" + ;; + 3) + echo "CRITICAL: Malformed URL" + ;; + 5) + echo "CRITICAL: Couldn't resolve proxy $proxy" + ;; + 6) + echo "CRITICAL: Couldn't resolve host" + ;; + 7) + echo "CRITICAL: Couldn't connect to proxy $proxy" + ;; + 22) + echo "CRITICAL: Server returned http code >= 400" + ;; + 52) + echo "CRITICAL: Server returned empty response (52)" + ;; + 56) + echo "CRITICAL: Failure recieving network data (56)" + ;; + 60) + echo "CRITICAL: SSL/TLS connection problem (60)" + ;; + *) + echo "UNKNOWN: $status - ${url}" + exit 3 + ;; + esac + exit 2 +fi + diff --git a/check_federation.py b/check_federation.py index 5dc0ddb..3087c3b 100644 --- a/check_federation.py +++ b/check_federation.py @@ -244,7 +244,7 @@ async def main() -> None: for x in prints: print(f'\n{x}', end=' ') - print(f"|'{bot1_hs_domain}_outbound'={bot1_output_msg}s;;; '{bot1_hs_domain}_inbound'={bot2_output_msg}s;;;") + print(f"|'{bot1_hs_domain}_outbound'={bot1_output_msg}s;;; '{bot1_hs_domain}_inbound'={bot1_output_msg}s;;;") sys.exit(nagios_output) diff --git a/check_media_cdn.py b/check_media_cdn.py index 39147b4..73e51eb 100644 --- a/check_media_cdn.py +++ b/check_media_cdn.py @@ -179,7 +179,7 @@ async def main() -> None: exit_code = nagios.CRITICAL prints.append(f"CRITICAL: recieved 301 to {urllib.parse.urlparse(headers['location']).netloc}") else: - prints.append(f'OK: is not redirected.') + prints.append(f'OK: was not redirected.') if 
args.required_headers: # Icinga may pass the values as one string @@ -192,11 +192,11 @@ async def main() -> None: if code > exit_code: exit_code = code - results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')] - for header_chk, code in results: - prints.append(header_chk) - if code > exit_code: - exit_code = code + # results = [verify_media_header('synapse-media-local-status', headers), verify_media_header('synapse-media-s3-status', headers, good_value='200'), verify_media_header('synapse-media-server', headers, good_value='s3')] + # for header_chk, code in results: + # prints.append(header_chk) + # if code > exit_code: + # exit_code = code clean_msg = await cleanup(client, test_image_path, image_event_id=image_event_id) diff --git a/check_monitor_bot.py b/check_monitor_bot.py index 95a3e17..7b483e8 100644 --- a/check_monitor_bot.py +++ b/check_monitor_bot.py @@ -54,9 +54,11 @@ def main(): m = re.match(r'\s*Send: (.*?)\s*\s*Receive: (.*?)\s*<\/span>', str(item)) if m: domain = item.parent.parent.find('span', {'class': 'domain'}).text + s = ms_to_s(m.group(1)) + r = ms_to_s(m.group(2)) data[domain] = { - 'send': ms_to_s(m.group(1)), - 'receive': ms_to_s(m.group(2)), + 'send': (s if s else -1), + 'receive': (r if r else -1), } exit_code = nagios.OK info_str = [] diff --git a/check_nginx b/check_nginx new file mode 100644 index 0000000..295e644 --- /dev/null +++ b/check_nginx @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# check_nginx is a Nagios to monitor nginx status +# The version is 1.0.2 +# fixed by Nikolay Kandalintsev (twitter: @nicloay) +# Based on yangzi2008@126.com from http://www.nginxs.com +# which available here http://exchange.nagios.org/directory/Plugins/Web-Servers/nginx/check_nginx/details + +import getopt +import string +import sys +import traceback + +import urllib +from urllib.request import 
urlopen + + +def usage(): + print("""check_nginx is a Nagios to monitor nginx status + Usage: + + check_nginx [-h|--help][-U|--url][-P|--path][-u|--user][-p|--passwd][-w|--warning][-c|--critical] + + Options: + --help|-h) + print check_nginx help. + --url|-U) + Sets nginx status url. + --path|-P) + Sets nginx status url path. Default is: off + --user|-u) + Sets nginx status BasicAuth user. Default is: off + --passwd|-p) + Sets nginx status BasicAuth passwd. Default is: off + --warning|-w) + Sets a warning level for nginx Active connections. Default is: off + --critical|-c) + Sets a critical level for nginx Active connections. Default is: off + Example: + The url is www.nginxs.com/status + ./check_nginx -U www.nginxs.com -P /status -u eric -p nginx -w 1000 -c 2000 + if dont't have password: + ./check_nginx -U www.nginxs.com -P /status -w 1000 -c 2000 + if don't have path and password: + ./check_nginx -U www.nginxs.com -w 1000 -c 2000""") + + sys.exit(3) + + +try: + options, args = getopt.getopt(sys.argv[1:], "hU:P:u:p:w:c:", ["help", "url=", "path=", "user=", "passwd=", "warning=", "critical="]) + +except getopt.GetoptError: + usage() + sys.exit(3) + +for name, value in options: + if name in ("-h", "--help"): + usage() + if name in ("-U", "--url"): + url = "http://" + value + if name in ("-P", "--path"): + path = value + if name in ("-u", "--user"): + user = value + if name in ("-p", "--passwd"): + passwd = value + if name in ("-w", "--warning"): + warning = value + if name in ("-c", "--critical"): + critical = value +try: + if 'path' in dir(): + req = urllib.Request(url + path) + else: + req = urllib.Request(url) + if 'user' in dir() and 'passwd' in dir(): + passman = urllib.HTTPPasswordMgrWithDefaultRealm() + passman.add_password(None, url + path, user, passwd) + authhandler = urllib.HTTPBasicAuthHandler(passman) + opener = urllib.build_opener(authhandler) + urllib.install_opener(opener) + response = urlopen(req) + the_page = response.readline() + conn = 
the_page.split() + ActiveConn = conn[2] + the_page1 = response.readline() + the_page2 = response.readline() + the_page3 = response.readline() + response.close() + b = the_page3.split() + reading = b[1] + writing = b[3] + waiting = b[5] + output = 'ActiveConn:%s,reading:%s,writing:%s,waiting:%s' % (ActiveConn, reading, writing, waiting) + perfdata = 'ActiveConn=%s;reading=%s;writing=%s;waiting=%s' % (ActiveConn, reading, writing, waiting) + +except Exception: + print("NGINX STATUS unknown: Error while getting Connection") + print(traceback.format_exc()) + sys.exit(3) +if 'warning' in dir() and 'critical' in dir(): + if int(ActiveConn) >= int(critical): + print('CRITICAL - %s|%s' % (output, perfdata)) + sys.exit(2) + elif int(ActiveConn) >= int(warning): + print('WARNING - %s|%s' % (output, perfdata)) + sys.exit(1) + else: + print('OK - %s|%s' % (output, perfdata)) + sys.exit(0) +else: + print('OK - %s|%s' % (output, perfdata)) + sys.exit(0) diff --git a/check_pve/Dockerfile b/check_pve/Dockerfile new file mode 100644 index 0000000..f3e6b7c --- /dev/null +++ b/check_pve/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3 + +ADD check_pve.py / +ADD requirements.txt / +RUN apt-get update +RUN apt install -y python3 python3-requests python3-packaging +RUN pip3 install -r requirements.txt + + +CMD ["tail", "-f", "/dev/null"] diff --git a/check_pve/LICENSE b/check_pve/LICENSE new file mode 100644 index 0000000..d159169 --- /dev/null +++ b/check_pve/LICENSE @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/check_pve/README.md b/check_pve/README.md new file mode 100644 index 0000000..9f034b3 --- /dev/null +++ b/check_pve/README.md @@ -0,0 +1,304 @@ +# check_pve +Icinga check command for Proxmox VE via API + +## Setup + +### Requirements + +This check command depends on **Python 3** and the following modules: + * requests + * argparse + * packaging + +**Installation on Debian / Ubuntu** +``` +apt install python3 python3-requests python3-packaging +``` + +**Installation on Redhat 7 / CentOS 7** +``` +yum install python36 python36-requests python36-packaging +``` + +**Installation on FreeBSD** +``` +pkg install python3 py39-requests py39-packaging +``` + +**Installation from requirements file** +``` +pip3 install -r requirements.txt +``` + +**Installation as Docker container** +``` +docker build -t check_pve . +``` +After this, you can start the container like so: +``` +docker run -d --name check_pve --rm check_pve +``` +The container will keep running without having the need for any of the requirements listed above (for environments that do not support this). 
+Running a check is as simple as: +``` +docker exec check_pve python check_pve.py ....rest of the default arguments listed below.... +``` + +### Create an API user in Proxmox VE + +Create a role named ``Monitoring`` and assign necessary privileges: + +``` +pveum roleadd Monitoring +pveum rolemod Monitoring --privs VM.Monitor,Sys.Audit,Datastore.Audit,VM.Audit +``` + +Create a user named ``monitoring``: + +``` +pveum useradd monitoring@pve --comment "The ICINGA 2 monitoring user" +``` + +#### Use token based authorization (recommended) + +Create an API token named `monitoring` for the user `monitoring`: + +``` +pveum user token add monitoring@pve monitoring +``` + +Please save the token secret as there isn't any way to fetch it at a later point. + +Assign the role `Monitoring` to the token `monitoring` and the user `monitoring@pve`: + +``` +pveum acl modify / --roles Monitoring --user 'monitoring@pve' +pveum acl modify / --roles Monitoring --tokens 'monitoring@pve!monitoring' +``` + + +#### Use password based authorization + +Set a password for the user `monitoring`: + +``` +pveum passwd monitoring@pve +``` + +Assign the ``Monitoring`` role to the user ``monitoring``: + +``` +pveum acl modify / --users monitoring@pve --roles Monitoring +``` + +For further information about the Proxmox VE privilege system have a look into the [documentation](https://pve.proxmox.com/pve-docs/pve-admin-guide.html#_strong_pveum_strong_proxmox_ve_user_manager). + + +## Usage + +The ``icinga2`` folder contains the command definition and service examples for use with Icinga2. 
+ +``` +usage: check_pve.py [-h] -e API_ENDPOINT [--api-port API_PORT] -u API_USER (-p API_PASSWORD | -t API_TOKEN) [-k] -m + {cluster,version,cpu,memory,swap,storage,io_wait,updates,services,subscription,vm,vm_status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation} [-n NODE] [--name NAME] [--vmid VMID] + [--expected-vm-status {running,stopped,paused}] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION] [--unit {GB,MB,KB,GiB,MiB,KiB,B}] + +Check command for PVE hosts via API + +options: + -h, --help show this help message and exit + +API Options: + -e API_ENDPOINT, --api-endpoint API_ENDPOINT + PVE api endpoint hostname + --api-port API_PORT PVE api endpoint port + -u API_USER, --username API_USER + PVE api user (e.g. icinga2@pve or icinga2@pam, depending on which backend you have chosen in proxmox) + -p API_PASSWORD, --password API_PASSWORD + PVE API user password + -t API_TOKEN, --api-token API_TOKEN + PVE API token (format: TOKEN_ID=TOKEN_SECRET + -k, --insecure Don't verify HTTPS certificate + +Check Options: + -m {cluster,version,cpu,memory,swap,storage,io_wait,updates,services,subscription,vm,vm_status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation}, --mode {cluster,version,cpu,memory,swap,storage,io_wait,updates,services,subscription,vm,vm_status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation} + Mode to use. + -n NODE, --node NODE Node to check (necessary for all modes except cluster and version) + --name NAME Name of storage, vm, or container + --vmid VMID ID of virtual machine or container + --expected-vm-status {running,stopped,paused} + Expected VM status + --ignore-vm-status Ignore VM status in checks + --ignore-service NAME + Ignore service NAME in checks + --ignore-disk NAME Ignore disk NAME in health check + -w THRESHOLD_WARNING, --warning THRESHOLD_WARNING + Warning threshold for check value. 
Mutiple thresholds with name:value,name:value + -c THRESHOLD_CRITICAL, --critical THRESHOLD_CRITICAL + Critical threshold for check value Mutiple thresholds with name:value,name:value + -M Values are shown in the unit which is set with --unit (if available). Thresholds are also treated in this unit + -V MIN_VERSION, --min-version MIN_VERSION + The minimal pve version to check for. Any version lower than this will return CRITICAL. + --unit {GB,MB,KB,GiB,MiB,KiB,B} + Unit which is used for performance data and other values + + +``` + +## Check examples + + +**Check cluster health** +``` +./check_pve.py -u <API_USER> -t <API_TOKEN> -e <API_ENDPOINT> -m cluster +OK - Cluster 'proxmox1' is healthy' +``` + +**Check PVE version** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m version -V 5.0.0 +OK - Your pve instance version '5.2' (0fcd7879) is up to date +``` + +**Check CPU load** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m cpu -n node1 +OK - CPU usage is 2.4%|usage=2.4%;; +``` + +**Check memory usage** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m memory -n node1 +OK - Memory usage is 37.44%|usage=37.44%;; used=96544.72MB;;;257867.91 +``` + +**Check disk-health** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m disk-health -n node1 +OK - All disks are healthy|wearout_sdb=96%;; wearout_sdc=96%;; wearout_sdd=96%;; wearout_sde=96%;; +``` + +**Check storage usage** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name local +OK - Storage usage is 54.23%|usage=54.23%;; used=128513.11MB;;;236980.36 + +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name vms-disx +CRITICAL - Storage 'vms-disx' doesn't exist on node 'node1' +``` + +**Check subscription status** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m subscription -n node1 -w 50 -c 10 +OK - Subscription of level 'Community' is valid until 2019-01-09 +``` + +**Check VM status** + +Without specifying a node name: +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm +OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=8.33%;; +``` + +You can also pass a container name for the VM check: +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm 
--name test-lxc +OK - LXC 'test-lxc' on node 'node1' is running|cpu=0.11%;; memory=13.99%;; +``` + +With memory thresholds: +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm -w 50 -c 80 +OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=40.33%;50.0;80.0 +``` + +With a specified node name, the check plugin verifies on which node the VM runs. +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm -n node1 --name test-vm +OK - VM 'test-vm' is running on node 'node1'|cpu=1.85%;; memory=8.33%;; + +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm -n node1 --name test-vm +WARNING - VM 'test-vm' is running on node 'node2' instead of 'node1'|cpu=1.85%;; memory=8.33%;; +``` + +If you only want to gather metrics and don't care about the VM status, add the ``--ignore-vm-status`` flag: +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm --ignore-vm-status +OK - VM 'test-vm' is not running +``` + +Specify the expected VM status: +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm --expected-vm-status stopped +OK - VM 'test-vm' is not running + +``` + +For hostalive checks without gathering performance data use ``vm_status`` instead of ``vm``. The parameters are the same as with ``vm``. 
+ +**Check swap usage** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m swap -n pve +OK - Swap usage is 0.0 %|usage=0.0%;; used=0.0MB;;;8192.0 +``` + +**Check storage replication status** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m replication -n node1 +OK - No failed replication jobs on node1 +``` + +**Check ceph cluster health** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m ceph-health +WARNING - Ceph Cluster is in warning state +``` + +**Check ZFS pool health** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve +OK - All ZFS pools are healthy +``` + +Check for specific pool: +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve --name rpool +OK - ZFS pool 'rpool' is healthy +``` + +**Check ZFS pool fragmentation** +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-fragmentation -n pve -w 40 -c 60 +CRITICAL - 2 of 2 ZFS pools are above fragmentation thresholds: + +- rpool (71 %) is CRITICAL +- diskpool (50 %) is WARNING +|fragmentation_diskpool=50%;40.0;60.0 fragmentation_rpool=71%;40.0;60.0 + +``` + +Check for specific pool: +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-fragmentation -n pve --name diskpool -w 40 -c 60 +WARNING - Fragmentation of ZFS pool 'diskpool' is above thresholds: 50 %|fragmentation=50%;40.0;60.0 +``` + +## FAQ + +### Individual thresholds per metric + +You can either specify a threshold for warning or critical which is applied to all metrics or define individual thresholds like this (`name:value,name:value,...`): + +``` +./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm -w memory:50 -c cpu:50,memory:80 +OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;50.0; memory=40.33%;50.0;80.0 +``` + +### Could not connect to PVE API: Failed to resolve hostname + +Verify that your DNS server is working and can resolve your hostname. If everything is fine, check for proxy server environment variables (HTTP_PROXY, HTTPS_PROXY), which may not allow communication to port 8006. + +## Contributors + +Thank you to everyone who is contributing to `check_pve`: https://github.com/nbuchwitz/check_pve/graphs/contributors. 
diff --git a/check_pve/check_pve.py b/check_pve/check_pve.py new file mode 100644 index 0000000..bf8ce6e --- /dev/null +++ b/check_pve/check_pve.py @@ -0,0 +1,819 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# ------------------------------------------------------------------------------ +# check_pve.py - A check plugin for Proxmox Virtual Environment (PVE). +# Copyright (C) 2018-2022 Nicolai Buchwitz +# +# Version: 1.2.2 +# +# ------------------------------------------------------------------------------ +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# ------------------------------------------------------------------------------

import argparse
import re
import sys
from datetime import datetime
from enum import Enum

# Third-party modules are imported defensively. A missing module is reported
# (and the process exits with 255) when the script is started, not at import
# time, so that importing this module never terminates the interpreter.
try:
    from packaging import version
    import requests
except ImportError as e:
    _IMPORT_ERROR = e
else:
    _IMPORT_ERROR = None


class CheckState(Enum):
    """Plugin result states; the value is used as the process exit code."""

    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3


class CheckThreshold:
    """A single numeric threshold that check values are compared against."""

    def __init__(self, value: float):
        self.value = value

    def __eq__(self, other):
        return self.value == other.value

    def __lt__(self, other):
        return self.value < other.value

    def __le__(self, other):
        return self.value <= other.value

    def __gt__(self, other):
        return self.value > other.value

    def __ge__(self, other):
        return self.value >= other.value

    def check(self, value: float, lower: bool = False):
        """Return True when *value* violates the threshold.

        By default a value above the threshold is a violation; with
        ``lower=True`` a value below the threshold is a violation.
        """
        if lower:
            return value < self.value
        else:
            return value > self.value

    @staticmethod
    def threshold_type(arg: str):
        """Parse a command line threshold into ``{metric name: CheckThreshold}``.

        A plain number applies to every metric and is stored under the key
        ``None``. Otherwise *arg* must be a comma separated list of
        ``name:value`` pairs.

        Raises:
            argparse.ArgumentTypeError: if an element is not ``name:value``
                with a numeric value.
        """
        thresholds = {}

        try:
            thresholds[None] = CheckThreshold(float(arg))
        except ValueError:
            for t in arg.split(','):
                # fullmatch: trailing garbage such as "cpu:50abc" must be rejected
                m = re.fullmatch(r"([a-z_0-9]+):([0-9.]+)", t)

                value = None
                if m:
                    try:
                        value = float(m.group(2))
                    except ValueError:
                        # matches the character class but is no number, e.g. "1.2.3"
                        pass

                if value is None:
                    raise argparse.ArgumentTypeError(
                        "invalid threshold format: {}".format(t))

                thresholds[m.group(1)] = CheckThreshold(value)

        return thresholds


class CheckPVE:
    """Runs one check mode against the PVE API and prints a plugin result line."""

    VERSION = '1.2.2'
    API_URL = 'https://{hostname}:{port}/api2/json/{command}'
    # (connect, read) timeout in seconds passed to every API request
    REQUEST_TIMEOUT = (5, 30)
    UNIT_SCALE = {
        "GB": 10**9,
        "MB": 10**6,
        "KB": 10**3,
        "GiB": 2**30,
        "MiB": 2**20,
        "KiB": 2**10,
        "B": 1
    }

    def check_output(self):
        """Print the collected message (plus perfdata) and exit with the check result."""
        message = self.check_message
        if self.perfdata:
            message += self.get_perfdata()

        self.output(self.check_result, message)

    @staticmethod
    def output(rc, message):
        """Print ``<STATE> - <message>`` and exit with the numeric value of *rc*."""
        prefix = rc.name
        message = '{} - {}'.format(prefix, message)

        print(message)
        sys.exit(rc.value)

    def get_url(self, command):
        """Return the full API URL for the API path *command*."""
        return self.API_URL.format(hostname=self.options.api_endpoint, command=command,
                                   port=self.options.api_port)

    def request(self, url, method='get', **kwargs):
        """Perform an API request and return the ``data`` member of the JSON reply.

        ``data=`` is sent with POST requests, ``params=`` with GET requests.
        Any transport or HTTP error ends the process through ``output()``.
        """
        response = None
        try:
            if method == 'post':
                response = requests.post(
                    url,
                    verify=not self.options.api_insecure,
                    data=kwargs.get('data', None),
                    timeout=self.REQUEST_TIMEOUT
                )
            elif method == 'get':
                response = requests.get(
                    url,
                    verify=not self.options.api_insecure,
                    cookies=self.__cookies,
                    headers=self.__headers,
                    params=kwargs.get('params', None),
                    timeout=self.REQUEST_TIMEOUT
                )
            else:
                self.output(CheckState.CRITICAL, "Unsupported request method: {}".format(method))
        # Order matters: ConnectTimeout and SSLError are subclasses of ConnectionError.
        except requests.exceptions.ConnectTimeout:
            self.output(CheckState.UNKNOWN, "Could not connect to PVE API: Connection timeout")
        except requests.exceptions.Timeout:
            self.output(CheckState.UNKNOWN, "Could not fetch data from API: Read timeout")
        except requests.exceptions.SSLError:
            self.output(CheckState.UNKNOWN, "Could not connect to PVE API: Certificate validation failed")
        except requests.exceptions.ConnectionError:
            self.output(CheckState.UNKNOWN, "Could not connect to PVE API: Failed to resolve hostname")
        except requests.exceptions.RequestException as e:
            self.output(CheckState.UNKNOWN, "Could not fetch data from API: {}".format(e))

        if response.ok:
            return response.json()['data']
        else:
            message = "Could not fetch data from API: "

            if response.status_code == 401:
                message += "Could not connect to PVE API: invalid username or password"
            elif response.status_code == 403:
                message += "Access denied. Please check if API user has sufficient permissions / the role has been " \
                           "assigned."
            else:
                message += "HTTP error code was {}".format(response.status_code)

            self.output(CheckState.UNKNOWN, message)

    def get_ticket(self):
        """Log in with username/password and return the authentication ticket."""
        url = self.get_url('access/ticket')
        data = {"username": self.options.api_user, "password": self.options.api_password}
        result = self.request(url, "post", data=data)

        return result['ticket']

    def check_api_value(self, url, message, **kwargs):
        """Fetch a usage value from *url* and evaluate it against the thresholds.

        ``key=`` selects a member of the reply. A dict reply is read as
        ``used``/``total``; any other reply is read as a 0..1 ratio.
        ``perfkey=`` overrides the perfdata label.
        """
        result = self.request(url)
        used = None

        if 'key' in kwargs:
            result = result[kwargs.get('key')]

        if isinstance(result, dict):
            used_percent = self.get_value(result['used'], result['total'])
            used = self.get_value(result['used'])
            total = self.get_value(result['total'])

            self.add_perfdata(kwargs.get('perfkey', 'usage'), used_percent)
            self.add_perfdata(kwargs.get('perfkey', 'used'), used, max=total, unit=self.options.unit)
        else:
            used_percent = round(float(result) * 100, 2)
            self.add_perfdata(kwargs.get('perfkey', 'usage'), used_percent)

        # Ratio-only metrics have no absolute value; fall back to percent for them
        # instead of reporting "None <unit>" and comparing thresholds against None.
        if self.options.values_mb and used is not None:
            message += ' {} {}'.format(used, self.options.unit)
            value = used
        else:
            message += ' {} {}'.format(used_percent, '%')
            value = used_percent

        self.check_thresholds(value, message)

    def check_vm_status(self, idx, **kwargs):
        """Check the state of the VM/LXC whose name or vmid equals *idx*.

        ``expected_state=`` defaults to ``running``; ``only_status=True``
        skips perfdata and threshold evaluation.
        """
        url = self.get_url('cluster/resources', )
        data = self.request(url, params={'type': 'vm'})

        expected_state = kwargs.get("expected_state", "running")
        only_status = kwargs.get("only_status", False)

        found = False
        for vm in data:
            if vm['name'] == idx or vm['vmid'] == idx:
                # Check if VM (default) or LXC
                vm_type = "VM"
                if vm['type'] == 'lxc':
                    vm_type = "LXC"

                if vm['status'] != expected_state:
                    self.check_message = "{} '{}' is {} (expected: {})".format(vm_type, vm['name'], vm['status'],
                                                                               expected_state)
                    if not self.options.ignore_vm_status:
                        self.check_result = CheckState.CRITICAL
                else:
                    if self.options.node and self.options.node != vm['node']:
                        self.check_message = "{} '{}' is {}, but located on node '{}' instead of '{}'" \
                            .format(vm_type, vm['name'], expected_state, vm['node'], self.options.node)
                        self.check_result = CheckState.WARNING
                    else:
                        self.check_message = "{} '{}' is {} on node '{}'" \
                            .format(vm_type, vm['name'], expected_state, vm['node'])

                if vm['status'] == 'running' and not only_status:
                    cpu = round(vm['cpu'] * 100, 2)
                    self.add_perfdata("cpu", cpu)

                    if self.options.values_mb:
                        memory = self.scale_value(vm['mem'])
                        self.add_perfdata("memory", memory, unit=self.options.unit,
                                          max=self.scale_value(vm['maxmem']))
                    else:
                        memory = self.get_value(vm['mem'], vm['maxmem'])
                        self.add_perfdata("memory", memory)

                    self.check_thresholds({"cpu": cpu, "memory": memory}, message=self.check_message)

                found = True
                break

        if not found:
            self.check_message = "VM or LXC '{}' not found".format(idx)
            self.check_result = CheckState.WARNING

    def check_disks(self):
        """Report disks of the node whose health is neither PASSED nor OK."""
        url = self.get_url('nodes/{}/disks'.format(self.options.node))

        failed = []
        unknown = []
        disks = self.request(url + '/list')
        for disk in disks:
            name = disk['devpath'].replace('/dev/', '')

            if name in self.options.ignore_disks:
                continue

            if disk['health'] == 'UNKNOWN':
                self.check_result = CheckState.WARNING
                unknown.append({"serial": disk["serial"], "device": disk['devpath']})

            elif disk['health'] not in ('PASSED', 'OK'):
                self.check_result = CheckState.WARNING
                failed.append({"serial": disk["serial"], "device": disk['devpath']})

            if disk['wearout'] != 'N/A':
                self.add_perfdata('wearout_{}'.format(name), disk['wearout'])

        if failed:
            self.check_message = "{} of {} disks failed the health test:\n".format(len(failed), len(disks))
            for disk in failed:
                self.check_message += "- {} with serial '{}'\n".format(disk['device'], disk['serial'])

        if unknown:
            self.check_message += "{} of {} disks have unknown health status:\n".format(len(unknown), len(disks))
            for disk in unknown:
                self.check_message += "- {} with serial '{}'\n".format(disk['device'], disk['serial'])

        if not failed and not unknown:
            self.check_message = "All disks are healthy"

    def check_replication(self):
        """Report replication jobs of the node with a non-zero fail count."""
        url = self.get_url('nodes/{}/replication'.format(self.options.node))

        if self.options.vmid:
            data = self.request(url, params={'guest': self.options.vmid})
        else:
            data = self.request(url)

        failed_jobs = []  # format: [{guest: str, fail_count: int, error: str}]
        performance_data = []

        for job in data:
            if job['fail_count'] > 0:
                failed_jobs.append({'guest': job['guest'], 'fail_count': job['fail_count'], 'error': job['error']})
            else:
                performance_data.append({'id': job['id'], 'duration': job['duration']})

        if failed_jobs:
            message = "Failed replication jobs on {}: ".format(self.options.node)
            for job in failed_jobs:
                message = message + "GUEST: {j[guest]}, FAIL_COUNT: {j[fail_count]}, ERROR: {j[error]} ; ".format(j=job)
            self.check_message = message
            self.check_result = CheckState.WARNING
        else:
            self.check_message = "No failed replication jobs on {}".format(self.options.node)
            self.check_result = CheckState.OK

        for metric in performance_data:
            self.add_perfdata('duration_' + metric['id'], metric['duration'], unit='s')

    def check_services(self):
        """Report active services of the node that are not running."""
        url = self.get_url('nodes/{}/services'.format(self.options.node))
        data = self.request(url)

        failed = {}
        for service in data:
            if service['state'] != 'running' \
                    and service.get('active-state', 'active') == 'active' \
                    and service['name'] not in self.options.ignore_services:
                failed[service['name']] = service['desc']

        if failed:
            self.check_result = CheckState.CRITICAL
            message = "{} services are not running:\n\n".format(len(failed))
            message += "\n".join(['- {} ({}) is not running'.format(failed[i], i) for i in failed])
            self.check_message = message
        else:
            self.check_message = "All services are running"

    def check_subscription(self):
        """Check the subscription state; thresholds are remaining days until expiry."""
        url = self.get_url('nodes/{}/subscription'.format(self.options.node))
        data = self.request(url)

        if data['status'] == 'NotFound':
            self.check_result = CheckState.WARNING
            self.check_message = "No valid subscription found"
        elif data['status'] == 'Inactive':
            self.check_result = CheckState.CRITICAL
            self.check_message = "Subscription expired"
        elif data['status'] == 'Active':
            subscription_due_date = data['nextduedate']
            subscription_product_name = data['productname']

            date_expire = datetime.strptime(subscription_due_date, '%Y-%m-%d')
            date_today = datetime.today()
            delta = (date_expire - date_today).days

            message = '{} is valid until {}'.format(
                subscription_product_name,
                subscription_due_date)
            message_warning_critical = '{} will expire in {} days ({})'.format(
                subscription_product_name,
                delta,
                subscription_due_date)

            self.check_thresholds(delta, message, messageWarning=message_warning_critical,
                                  messageCritical=message_warning_critical, lowerValue=True)
        else:
            # Do not report OK with an empty message for a status this check does not know.
            self.check_result = CheckState.UNKNOWN
            self.check_message = "Unexpected subscription status '{}'".format(data['status'])

    def check_updates(self):
        """Warn when the node reports pending package updates."""
        url = self.get_url('nodes/{}/apt/update'.format(self.options.node))
        count = len(self.request(url))

        if count:
            self.check_result = CheckState.WARNING
            msg = "{} pending update"
            if count > 1:
                msg += "s"
            self.check_message = msg.format(count)
        else:
            self.check_message = "System up to date"

    def check_cluster_status(self):
        """Check cluster quorum and count online nodes."""
        url = self.get_url('cluster/status')
        data = self.request(url)

        nodes = {}
        quorate = None
        cluster = ''
        for elem in data:
            if elem['type'] == 'cluster':
                quorate = elem['quorate']
                cluster = elem['name']
            elif elem['type'] == 'node':
                nodes[elem['name']] = elem['online']

        if quorate is None:
            self.check_message = 'No cluster configuration found'
        elif quorate:
            node_count = len(nodes)
            nodes_online_count = len({k: v for k, v in nodes.items() if v})

            if node_count > nodes_online_count:
                diff = node_count - nodes_online_count
                self.check_result = CheckState.WARNING
                self.check_message = "Cluster '{}' is healthy, but {} node(s) offline'".format(cluster, diff)
            else:
                self.check_message = "Cluster '{}' is healthy'".format(cluster)

            self.add_perfdata('nodes_total', node_count, unit='')
            self.add_perfdata('nodes_online', nodes_online_count, unit='')
        else:
            self.check_result = CheckState.CRITICAL
            self.check_message = 'Cluster is unhealthy - no quorum'

    def check_zfs_fragmentation(self, name=None):
        """Compare ZFS pool fragmentation with the thresholds.

        With *name* only that pool is evaluated, otherwise all pools of the node.
        """
        url = self.get_url('nodes/{}/disks/zfs'.format(self.options.node))
        data = self.request(url)

        warnings = []
        critical = []
        found = name is None
        for pool in data:
            found = found or name == pool['name']
            if (name is not None and name == pool['name']) or name is None:
                key = "fragmentation"
                if name is None:
                    key += '_{}'.format(pool['name'])
                self.add_perfdata(key, pool['frag'])

                threshold_name = "fragmentation_{}".format(pool['name'])
                threshold_warning = self.threshold_warning(threshold_name)
                threshold_critical = self.threshold_critical(threshold_name)

                if threshold_critical is not None and pool['frag'] > float(
                        threshold_critical.value):
                    critical.append(pool)
                elif threshold_warning is not None and pool['frag'] > float(
                        threshold_warning.value):
                    warnings.append(pool)

        if not found:
            self.check_result = CheckState.UNKNOWN
            self.check_message = "Could not fetch fragmentation of ZFS pool '{}'".format(name)
        else:
            if warnings or critical:
                value = None
                if critical:
                    self.check_result = CheckState.CRITICAL
                    if name is not None:
                        value = critical[0]['frag']
                else:
                    self.check_result = CheckState.WARNING
                    if name is not None:
                        value = warnings[0]['frag']

                if name is not None:
                    self.check_message = "Fragmentation of ZFS pool '{}' is above thresholds: {} %".format(name, value)
                else:
                    message = "{} of {} ZFS pools are above fragmentation thresholds:\n\n".format(
                        len(warnings) + len(critical), len(data))
                    message += "\n".join(
                        ['- {} ({} %) is CRITICAL\n'.format(pool['name'], pool['frag']) for pool in critical])
                    message += "\n".join(
                        ['- {} ({} %) is WARNING\n'.format(pool['name'], pool['frag']) for pool in warnings])
                    self.check_message = message
            else:
                self.check_result = CheckState.OK
                if name is not None:
                    self.check_message = "Fragmentation of ZFS pool '{}' is OK".format(name)
                else:
                    self.check_message = "Fragmentation of all ZFS pools is OK"

    def check_zfs_health(self, name=None):
        """Report ZFS pools whose health is not ``online``.

        With *name* only that pool is evaluated, otherwise all pools of the node.
        """
        url = self.get_url('nodes/{}/disks/zfs'.format(self.options.node))
        data = self.request(url)

        unhealthy = []
        found = name is None
        healthy_conditions = ['online']
        for pool in data:
            found = found or name == pool['name']
            if (name is not None and name == pool['name']) or name is None:
                if pool['health'].lower() not in healthy_conditions:
                    unhealthy.append(pool)

        if not found:
            self.check_result = CheckState.UNKNOWN
            self.check_message = "Could not fetch health of ZFS pool '{}'".format(name)
        else:
            if unhealthy:
                self.check_result = CheckState.CRITICAL
                message = "{} ZFS pools are not healthy:\n\n".format(len(unhealthy))
                message += "\n".join(
                    ['- {} ({}) is not healthy'.format(pool['name'], pool['health']) for pool in unhealthy])
                self.check_message = message
            else:
                self.check_result = CheckState.OK
                if name is not None:
                    self.check_message = "ZFS pool '{}' is healthy".format(name)
                else:
                    self.check_message = "All ZFS pools are healthy"

    def check_ceph_health(self):
        """Map the Ceph health status string to a check state."""
        url = self.get_url('cluster/ceph/status')
        data = self.request(url)
        ceph_health = data.get('health', {})

        if 'status' not in ceph_health:
            self.check_result = CheckState.UNKNOWN
            self.check_message = "Could not fetch Ceph status from API. " \
                                 "Check the output of 'pvesh get cluster/ceph' on your node"
            return

        if ceph_health['status'] == 'HEALTH_OK':
            self.check_result = CheckState.OK
            self.check_message = "Ceph Cluster is healthy"
        elif ceph_health['status'] == 'HEALTH_WARN':
            self.check_result = CheckState.WARNING
            self.check_message = "Ceph Cluster is in warning state"
        elif ceph_health['status'] == 'HEALTH_CRIT':
            self.check_result = CheckState.CRITICAL
            self.check_message = "Ceph Cluster is in critical state"
        else:
            self.check_result = CheckState.UNKNOWN
            self.check_message = "Ceph Cluster is in unknown state"

    def check_storage(self, name):
        """Check the usage of storage *name* on the node."""
        # check if storage exists
        url = self.get_url('nodes/{}/storage'.format(self.options.node))
        data = self.request(url)

        if not any(s['storage'] == name for s in data):
            self.check_result = CheckState.CRITICAL
            self.check_message = "Storage '{}' doesn't exist on node '{}'".format(name, self.options.node)
            return

        url = self.get_url('nodes/{}/storage/{}/status'.format(self.options.node, name))
        self.check_api_value(url, "Usage of storage '{}' is".format(name))

    def check_version(self):
        """Compare the PVE version with ``--min-version`` (if given)."""
        url = self.get_url('version')
        data = self.request(url)
        if not data['version']:
            self.check_result = CheckState.UNKNOWN
            self.check_message = "Unable to determine pve version"
        elif self.options.min_version and version.parse(self.options.min_version) > version.parse(data['version']):
            self.check_result = CheckState.CRITICAL
            self.check_message = "Current pve version '{}' ({}) is lower than the min. required version '{}'".format(
                data['version'], data['repoid'], self.options.min_version)
        else:
            self.check_message = "Your pve instance version '{}' ({}) is up to date".format(data['version'],
                                                                                             data['repoid'])

    def check_memory(self):
        """Check memory usage of the node."""
        url = self.get_url('nodes/{}/status'.format(self.options.node))
        self.check_api_value(url, 'Memory usage is', key='memory')

    def check_swap(self):
        """Check swap usage of the node."""
        url = self.get_url('nodes/{}/status'.format(self.options.node))
        self.check_api_value(url, 'Swap usage is', key='swap')

    def check_cpu(self):
        """Check CPU usage of the node."""
        url = self.get_url('nodes/{}/status'.format(self.options.node))
        self.check_api_value(url, 'CPU usage is', key='cpu')

    def check_io_wait(self):
        """Check IO wait of the node."""
        url = self.get_url('nodes/{}/status'.format(self.options.node))
        self.check_api_value(url, 'IO wait is', key='wait', perfkey='wait')

    def check_thresholds(self, value, message, **kwargs):
        """Evaluate *value* against the thresholds and set result and message.

        *value* is a number (compared with the global thresholds) or a dict
        ``{metric name: number}``. ``lowerValue=True`` inverts the comparison;
        ``messageWarning=``/``messageCritical=`` replace *message* for those
        states. The check result is left unchanged when nothing is violated.
        """
        is_warning = False
        is_critical = False
        lower = kwargs.get('lowerValue', False)

        metrics = value if isinstance(value, dict) else {None: value}

        for metric, metric_value in metrics.items():
            value_warning = self.threshold_warning(metric)
            if value_warning is not None:
                is_warning = is_warning or value_warning.check(metric_value, lower)

            value_critical = self.threshold_critical(metric)
            if value_critical is not None:
                is_critical = is_critical or value_critical.check(metric_value, lower)

        if is_critical:
            self.check_result = CheckState.CRITICAL
            self.check_message = kwargs.get('messageCritical', message)
        elif is_warning:
            self.check_result = CheckState.WARNING
            self.check_message = kwargs.get('messageWarning', message)
        else:
            self.check_message = message

    def scale_value(self, value):
        """Convert *value* from bytes into the unit selected with ``--unit``.

        Raises:
            ValueError: if the configured unit is not a key of ``UNIT_SCALE``.
        """
        if self.options.unit in self.UNIT_SCALE:
            return value / self.UNIT_SCALE[self.options.unit]

        raise ValueError("wrong unit: {}".format(self.options.unit))

    def threshold_warning(self, name: str):
        """Return the warning threshold for metric *name*, else the global one, else None."""
        return self.options.threshold_warning.get(name, self.options.threshold_warning.get(None, None))

    def threshold_critical(self, name: str):
        """Return the critical threshold for metric *name*, else the global one, else None."""
        return self.options.threshold_critical.get(name, self.options.threshold_critical.get(None, None))

    def get_value(self, value, total=None):
        """Return *value* as percent of *total*, or scaled to the unit when *total* is falsy."""
        value = float(value)

        if total:
            value /= float(total) / 100
        else:
            value = self.scale_value(value)

        return round(value, 2)

    def add_perfdata(self, name, value, **kwargs):
        """Append ``name=value<unit>;warn;crit;min;max`` to the perfdata list.

        ``unit=`` defaults to ``%``, ``min=`` to 0 and ``max=`` to empty.
        """
        unit = kwargs.get('unit', '%')

        perfdata = '{}={}{}'.format(name, value, unit)

        threshold_warning = self.threshold_warning(name)
        threshold_critical = self.threshold_critical(name)

        perfdata += ';'
        if threshold_warning is not None:
            perfdata += str(threshold_warning.value)

        perfdata += ';'
        if threshold_critical is not None:
            perfdata += str(threshold_critical.value)

        perfdata += ';{}'.format(kwargs.get('min', 0))
        perfdata += ';{}'.format(kwargs.get('max', ''))

        self.perfdata.append(perfdata)

    def get_perfdata(self):
        """Return the perfdata section (``|a=1 b=2``) or an empty string."""
        perfdata = ''

        if self.perfdata:
            perfdata = '|'
            perfdata += ' '.join(self.perfdata)

        return perfdata

    def check(self):
        """Run the check selected with ``--mode``, print the result and exit."""
        self.check_result = CheckState.OK

        if self.options.mode == 'cluster':
            self.check_cluster_status()
        elif self.options.mode == 'version':
            self.check_version()
        elif self.options.mode == 'memory':
            self.check_memory()
        elif self.options.mode == 'swap':
            self.check_swap()
        elif self.options.mode == 'io_wait':
            self.check_io_wait()
        elif self.options.mode == 'disk-health':
            self.check_disks()
        elif self.options.mode == 'cpu':
            self.check_cpu()
        elif self.options.mode == 'services':
            self.check_services()
        elif self.options.mode == 'updates':
            self.check_updates()
        elif self.options.mode == 'subscription':
            self.check_subscription()
        elif self.options.mode == 'storage':
            self.check_storage(self.options.name)
        elif self.options.mode in ['vm', 'vm_status']:
            only_status = self.options.mode == 'vm_status'

            if self.options.name:
                idx = self.options.name
            else:
                idx = self.options.vmid

            if self.options.expected_vm_status:
                self.check_vm_status(idx, expected_state=self.options.expected_vm_status, only_status=only_status)
            else:
                self.check_vm_status(idx, only_status=only_status)
        elif self.options.mode == 'replication':
            self.check_replication()
        elif self.options.mode == 'ceph-health':
            self.check_ceph_health()
        elif self.options.mode == 'zfs-health':
            self.check_zfs_health(self.options.name)
        elif self.options.mode == 'zfs-fragmentation':
            self.check_zfs_fragmentation(self.options.name)
        else:
            message = "Check mode '{}' not known".format(self.options.mode)
            self.output(CheckState.UNKNOWN, message)

        self.check_output()

    def parse_args(self):
        """Parse and validate the command line; the result is stored in ``self.options``."""
        p = argparse.ArgumentParser(description='Check command for PVE hosts via API')

        api_opts = p.add_argument_group('API Options')

        api_opts.add_argument("-e", "--api-endpoint", required=True, help="PVE api endpoint hostname")
        api_opts.add_argument("--api-port", required=False, help="PVE api endpoint port")

        api_opts.add_argument("-u", "--username", dest='api_user', required=True,
                              help="PVE api user (e.g. icinga2@pve or icinga2@pam, depending on which backend you "
                                   "have chosen in proxmox)")

        group = api_opts.add_mutually_exclusive_group(required=True)
        group.add_argument("-p", "--password", dest='api_password', help="PVE API user password")
        group.add_argument("-t", "--api-token", dest="api_token", help="PVE API token (format: TOKEN_ID=TOKEN_SECRET")

        api_opts.add_argument("-k", "--insecure", dest='api_insecure', action='store_true', default=False,
                              help="Don't verify HTTPS certificate")

        api_opts.set_defaults(api_port=8006)

        check_opts = p.add_argument_group('Check Options')

        check_opts.add_argument("-m", "--mode",
                                choices=(
                                    'cluster', 'version', 'cpu', 'memory', 'swap', 'storage', 'io_wait', 'updates',
                                    'services', 'subscription', 'vm', 'vm_status', 'replication', 'disk-health',
                                    'ceph-health', 'zfs-health', 'zfs-fragmentation'),
                                required=True,
                                help="Mode to use.")

        check_opts.add_argument('-n', '--node', dest='node',
                                help='Node to check (necessary for all modes except cluster and version)')

        check_opts.add_argument('--name', dest='name',
                                help='Name of storage, vm, or container')

        check_opts.add_argument('--vmid', dest='vmid', type=int,
                                help='ID of virtual machine or container')

        check_opts.add_argument('--expected-vm-status', choices=('running', 'stopped', 'paused'),
                                help='Expected VM status')

        check_opts.add_argument('--ignore-vm-status', dest='ignore_vm_status', action='store_true',
                                help='Ignore VM status in checks',
                                default=False)

        check_opts.add_argument('--ignore-service', dest='ignore_services', action='append', metavar='NAME',
                                help='Ignore service NAME in checks', default=[])

        check_opts.add_argument('--ignore-disk', dest='ignore_disks', action='append', metavar='NAME',
                                help='Ignore disk NAME in health check', default=[])

        check_opts.add_argument('-w', '--warning', dest='threshold_warning', type=CheckThreshold.threshold_type,
                                default={},
                                help='Warning threshold for check value. Mutiple thresholds with name:value,name:value')
        check_opts.add_argument('-c', '--critical', dest='threshold_critical', type=CheckThreshold.threshold_type,
                                default={},
                                help='Critical threshold for check value. Mutiple thresholds with name:value,name:value')
        check_opts.add_argument('-M', dest='values_mb', action='store_true', default=False,
                                help='Values are shown in the unit which is set with --unit (if available). Thresholds are also treated in this unit')
        check_opts.add_argument('-V', '--min-version', dest='min_version', type=str,
                                help='The minimal pve version to check for. Any version lower than this will return '
                                     'CRITICAL.')

        check_opts.add_argument('--unit', choices=self.UNIT_SCALE.keys(), default='MiB',
                                help='Unit which is used for performance data and other values')

        options = p.parse_args()

        if not options.node and options.mode not in ['cluster', 'vm', 'vm_status', 'version', 'ceph-health']:
            p.print_usage()
            message = "{}: error: --mode {} requires node name (--node)".format(p.prog, options.mode)
            self.output(CheckState.UNKNOWN, message)

        if not options.vmid and not options.name and options.mode in ('vm', 'vm_status'):
            p.print_usage()
            message = "{}: error: --mode {} requires either vm name (--name) or id (--vmid)".format(p.prog,
                                                                                                   options.mode)
            self.output(CheckState.UNKNOWN, message)

        if not options.name and options.mode == 'storage':
            p.print_usage()
            message = "{}: error: --mode {} requires storage name (--name)".format(p.prog, options.mode)
            self.output(CheckState.UNKNOWN, message)

        def compare_thresholds(threshold_warning, threshold_critical, comparator):
            # True when comparator(warning, critical) holds for every metric that has
            # both a warning and a critical threshold (named or global).
            ok = True
            keys = set(list(threshold_warning.keys()) + list(threshold_critical.keys()))
            for key in keys:
                if (key in threshold_warning and key in threshold_critical) or (
                        None in threshold_warning and None in threshold_critical):
                    ok = ok and comparator(threshold_warning[key], threshold_critical[key])
                elif key in threshold_warning and None in threshold_critical:
                    ok = ok and comparator(threshold_warning[key], threshold_critical[None])
                elif key in threshold_critical and None in threshold_warning:
                    ok = ok and comparator(threshold_warning[None], threshold_critical[key])

            return ok

        if options.threshold_warning and options.threshold_critical:
            if options.mode != 'subscription' and not compare_thresholds(options.threshold_warning,
                                                                         options.threshold_critical,
                                                                         lambda w, c: w <= c):
                p.error("Critical value must be greater than warning value")
            elif options.mode == 'subscription' and not compare_thresholds(options.threshold_warning,
                                                                           options.threshold_critical,
                                                                           lambda w, c: w >= c):
                p.error("Critical value must be lower than warning value")

        self.options = options

    def __init__(self):
        """Parse the command line and prepare API authentication (ticket or token)."""
        self.options = {}
        self.ticket = None
        self.perfdata = []
        self.check_result = CheckState.UNKNOWN
        self.check_message = ""

        self.__headers = {}
        self.__cookies = {}

        self.parse_args()

        if self.options.api_insecure:
            # disable urllib3 warning about insecure requests
            requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

        if self.options.api_password is not None:
            self.__cookies['PVEAuthCookie'] = self.get_ticket()
        elif self.options.api_token is not None:
            self.__headers["Authorization"] = "PVEAPIToken={}!{}".format(self.options.api_user, self.options.api_token)


if __name__ == "__main__":
    if _IMPORT_ERROR is not None:
        print("Missing python module: {}".format(str(_IMPORT_ERROR)))
        sys.exit(255)

    pve = CheckPVE()
    pve.check()
"links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "icinga2", + "fill": 1, + "id": 1, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "CRITICAL", + "color": "#BF1B00", + "fill": 0, + "legend": false + }, + { + "alias": "WARNING", + "color": "#EAB839", + "fill": 0, + "legend": false + }, + { + "alias": "memory used", + "color": "#0A437C", + "yaxis": 2 + }, + { + "alias": "memory used", + "fill": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$service usage", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "usage" + } + ] + }, + { + "alias": "WARNING", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { 
+ "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", + "rawQuery": false, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "warn" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "usage" + } + ] + }, + { + "alias": "CRITICAL", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", + "rawQuery": false, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "crit" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "usage" + } + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$service usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": 
"graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "% usage", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "bytes", + "label": "used MB", + "logBase": 1, + "max": null, + "min": "0", + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "icinga2", + "fill": 1, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "CRITICAL", + "color": "#BF1B00", + "fill": 0, + "legend": false + }, + { + "alias": "WARNING", + "color": "#EAB839", + "fill": 0, + "legend": false + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$service used", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "used" + } + ] + }, + { + "alias": "WARNING", + "dsType": "influxdb", 
+ "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", + "rawQuery": false, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "warn" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "used" + } + ] + }, + { + "alias": "CRITICAL", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", + "rawQuery": false, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "crit" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "used" + } + ] + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeShift": null, + "title": "$service used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "used", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "bytes", + "label": "used MB", + "logBase": 1, + "max": null, + "min": "0", + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "icinga2", + "fill": 1, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "CRITICAL", + "color": "#BF1B00", + "fill": 0, + "legend": false + }, + { + "alias": "WARNING", + "color": "#EAB839", + "fill": 0, + "legend": false + }, + { + "alias": "memory used", + "color": "#0A437C", + "yaxis": 2 + }, + { + "alias": "memory used", + "fill": 0 + } + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "I/O wait", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + 
}, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "wait" + } + ] + }, + { + "alias": "WARNING", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", + "rawQuery": false, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "warn" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "wait" + } + ] + }, + { + "alias": "CRITICAL", + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "metric" + ], + "type": "tag" + }, + { + "params": [ + "none" + ], + "type": "fill" + } + ], + "hide": false, + "measurement": "pve", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", + "rawQuery": false, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "crit" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [ + { + "key": "hostname", + "operator": "=~", + "value": "/^$hostname$/" + 
}, + { + "condition": "AND", + "key": "service", + "operator": "=~", + "value": "/^$service$/" + }, + { + "condition": "AND", + "key": "metric", + "operator": "=", + "value": "wait" + } + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "I/O wait", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "% usage", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "bytes", + "label": "used MB", + "logBase": 1, + "max": null, + "min": "0", + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "icmp checks", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "pve01.willi-graf.local", + "value": "pve01.willi-graf.local" + }, + "datasource": "icinga2", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "hostname", + "options": [], + "query": "SHOW TAG VALUES WITH KEY = \"hostname\"", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "io_wait", + "value": "io_wait" + }, + "datasource": "icinga2", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "service", + "options": [], + "query": "SHOW TAG VALUES WITH KEY = \"service\" where hostname =~ /^$hostname$/", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-2m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + 
"1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "icinga-pve-metrics", + "version": 23 +} diff --git a/check_pve/icinga2/command.conf b/check_pve/icinga2/command.conf new file mode 100644 index 0000000..c9415c4 --- /dev/null +++ b/check_pve/icinga2/command.conf @@ -0,0 +1,74 @@ +object CheckCommand "pve" { + import "plugin-check-command" + + command = [ PluginDir + "/check_pve.py" ] + + arguments = { + "-e" = { + value = "$pve_host$" + required = true + description = "Hostname for PVE API" + } + "-u" = { + value = "$pve_user$" + required = true + description = "API user (ex. monitoring@pve)" + } + "-p" = { + value = "$pve_password$" + required = true + description = "API user password" + } + "-k" = { + set_if = "$pve_insecure_connection$" + description = "Connect to this host instead of $pve_host$" + } + "-m" = { + value = "$pve_mode$" + required = true + description = "Check mode (cluster, version, updates, subscription, storage, cpu, memory, io_wait, vm, replication)" + } + "-n" = { + value = "$pve_node$" + description = "Node to check (necessary for all modes except cluster and version)" + } + "--name" = { + value = "$pve_resource_name$" + description = "Name of storage or vm to check" + } + "--expected-vm-status" = { + value = "$pve_expected_vm_status$" + description = "Expected status of the VM" + } + "--ignore-service" = { + repeat_key = true + value = "$pve_ignore_services$" + description = "Ignore services in check" + } + "--ignore-disk" = { + repeat_key = true + value = "$pve_ignore_disks$" + description = "Ignore disks in check" + } + "--ignore-vm-status" = { + set_if = "$pve_ignore_vm_status$" + description = "Ignore VM status in check" + } + "-w" = { + value = "$pve_warning$" + description = "Warning treshold" + } + "-c" = { + value = "$pve_critical$" + description = "Critical treshold" + } + "-M" = { + 
set_if = "$pve_tresholds_mb$" + description = "Unit of thresholds and values is MB" + } + "-V" = { + value = "$pve_min_version$" + description = "Minimal pve version. Everything lower than this will return CRITICAL." + } + } +} diff --git a/check_pve/icinga2/service.conf b/check_pve/icinga2/service.conf new file mode 100644 index 0000000..47fadc4 --- /dev/null +++ b/check_pve/icinga2/service.conf @@ -0,0 +1,139 @@ +template Host "proxmox-host" { + import "generic-host" + + vars.pve_host = name + vars.pve_node = name.split(".")[0] + // ... or if not matching the fqdn (nodename.domain.example) + // vars.pve_node = "proxmox-host" + + // if your icinga host doesn't trust your pve certificate, you'll have to uncomment this line + // vars.pve_insecure_connection = true + vars.pve_user = "monitor@pve" + vars.pve_password = "SuperSecretPassw0rd" + + // change to false if the node is not a member of a pve cluster + vars.pve_cluster = true +} + +object Host "proxmox-host.domain.example" { + import "proxmox-host" + + address = "192.168.42.42" + + vars.pve_storage["flashpool"] = { + pve_warning = 80 + pve_critical = 90 + } + + vars.pve_storage["diskpool"] = { + pve_warning = 80 + pve_critical = 90 + } + + // Ignore these disks in health check (USB sticks, SD cards, etc.)
+ vars.pve_ignore_disks = [ "sdn", "sdg" ] + + vars.virtual_machines["vm-01"] = { + } +} + +template Service "pve-service" { + import "generic-service" + + check_command = "pve" +} + +apply Service "cluster" { + import "pve-service" + + vars.pve_mode = "cluster" + + assign where host.vars.pve_host && host.vars.pve_cluster +} + +apply Service "services" { + import "pve-service" + + vars.pve_mode = "services" + + // Ignore cluster status on single nodes + if (!host.vars.pve_cluster) { + vars.pve_ignore_services = host.vars.pve_ignore_services || [] + vars.pve_ignore_services.add("corosync") + } + + assign where host.vars.pve_host +} + +apply Service "updates" { + import "pve-service" + + check_interval = 12h + retry_interval = 2h + max_check_attempts = 3 + + vars.pve_mode = "updates" + + assign where host.vars.pve_host +} + +apply Service "disk-health" { + import "pve-service" + + vars.pve_mode = "disk-health" + + assign where host.vars.pve_host +} + +apply Service "io_wait" { + import "pve-service" + + vars.pve_mode = "io_wait" + + vars.pve_warning = 10 + vars.pve_critical = 30 + + assign where host.vars.pve_host +} + +apply Service "cpu" { + import "pve-service" + + vars.pve_mode = "cpu" + + vars.pve_warning = 70 + vars.pve_critical = 90 + + assign where host.vars.pve_host +} + +apply Service "memory" { + import "pve-service" + + vars.pve_mode = "memory" + + vars.pve_warning = 80 + vars.pve_critical = 90 + + assign where host.vars.pve_host +} + +apply Service "storage " for (storage => config in host.vars.pve_storage) { + import "pve-service" + + vars += config + + vars.pve_mode = "storage" + vars.pve_resource_name = storage +} + +apply Service "pve-vm " for (vm => config in host.vars.virtual_machines) { + import "pve-service" + + vars += config + + vars.pve_mode = "vm" + vars.pve_resource_name = vm + + assign where host.vars.pve_host +} diff --git a/check_pve/requirements.txt b/check_pve/requirements.txt new file mode 100644 index 0000000..8bb6152 --- /dev/null 
+++ b/check_pve/requirements.txt @@ -0,0 +1,3 @@ +requests +argparse +packaging diff --git a/checker/synapse_client.py b/checker/synapse_client.py index e2a8b04..dd3caf8 100644 --- a/checker/synapse_client.py +++ b/checker/synapse_client.py @@ -9,7 +9,7 @@ import aiofiles.os import magic import markdown from PIL import Image -from nio import AsyncClient, LoginResponse, RoomSendError, UploadResponse, MatrixRoom, RoomLeaveResponse, RoomForgetResponse +from nio import AsyncClient, LoginResponse, MatrixRoom, RoomForgetResponse, RoomLeaveResponse, RoomSendError, UploadResponse from . import nagios @@ -166,7 +166,6 @@ async def leave_all_rooms_async(client, exclude_starting_with=None): await client.sync() invited_rooms = copy.copy(client.invited_rooms) # RuntimeError: dictionary changed size during iteration for name, room in invited_rooms.items(): - print(room.room_id) # if exclude_starting_with and room.named_room_name() is not None and room.named_room_name().startswith(exclude_starting_with): # continue s, l, f = await leave_room_async(room.room_id, client)