check_pve: option to ignore unknown disks

check_scrutiny_disks: draft
This commit is contained in:
Cyberes 2023-05-28 12:50:04 -06:00
parent f021c8cddd
commit 357f1f2d9e
2 changed files with 86 additions and 14 deletions

View File

@ -23,8 +23,8 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
import sys
import re import re
import sys
try: try:
from enum import Enum from enum import Enum
@ -259,6 +259,7 @@ class CheckPVE:
continue continue
if disk['health'] == 'UNKNOWN': if disk['health'] == 'UNKNOWN':
if not self.options.ignore_unknown_disks:
self.check_result = CheckState.WARNING self.check_result = CheckState.WARNING
unknown.append({"serial": disk["serial"], "device": disk['devpath']}) unknown.append({"serial": disk["serial"], "device": disk['devpath']})
@ -755,6 +756,8 @@ class CheckPVE:
check_opts.add_argument('--unit', choices=self.UNIT_SCALE.keys(), default='MiB', help='Unit which is used for performance data and other values') check_opts.add_argument('--unit', choices=self.UNIT_SCALE.keys(), default='MiB', help='Unit which is used for performance data and other values')
check_opts.add_argument('--ignore-unknown-disks', action='store_true', help='Skip checking disks that have an unknown health status (usually because they don\'t support SMART.')
options = p.parse_args() options = p.parse_args()
if not options.node and options.mode not in ['cluster', 'vm', 'vm_status', 'version', 'ceph-health']: if not options.node and options.mode not in ['cluster', 'vm', 'vm_status', 'version', 'ceph-health']:
@ -815,5 +818,6 @@ class CheckPVE:
elif self.options.api_token is not None: elif self.options.api_token is not None:
self.__headers["Authorization"] = "PVEAPIToken={}!{}".format(self.options.api_user, self.options.api_token) self.__headers["Authorization"] = "PVEAPIToken={}!{}".format(self.options.api_user, self.options.api_token)
pve = CheckPVE() pve = CheckPVE()
pve.check() pve.check()

68
check_scrutiny_disks.py Executable file
View File

@ -0,0 +1,68 @@
#!/usr/bin/env python3
import argparse
import json
import subprocess
import sys
from typing import List
from checker import nagios
import requests
def get_disk_wwn_ids() -> List[str]:
    """Return the WWNs of local disks whose SMART support is reported as enabled.

    Devices are listed with ``lsblk`` (NAME, WWN and TYPE columns, no
    header, full device paths). Every row of type ``disk`` with a WWN other
    than ``"0"`` is probed with ``smartctl -i``; the WWN is kept when the
    output contains ``SMART support is: Enabled``.

    Returns:
        The matching WWN strings in ``lsblk`` order. The list is empty when
        ``lsblk`` itself exits with a non-zero status (the error is printed).
    """
    wwn_ids = []
    try:
        output = subprocess.check_output(["lsblk", "-o", "NAME,WWN,TYPE", "-d", "-n", "-p"])
    except subprocess.CalledProcessError as e:
        print(f"Subprocess Error: {e}")
        return wwn_ids

    for line in output.decode("utf-8").strip().split("\n"):
        parts = line.split()
        # Rows that do not split into exactly three columns are ignored
        # (presumably devices for which lsblk prints an empty WWN).
        if len(parts) != 3:
            continue
        name, wwn, disk_type = parts
        if wwn == "0" or disk_type != "disk":
            continue
        try:
            smart_info = subprocess.check_output(["smartctl", "-i", name]).decode("utf-8")
        except subprocess.CalledProcessError as e:
            # smartctl reports its result as a bitmask exit status and may
            # exit non-zero even though the info section was printed. Probe
            # each disk in isolation so one such disk neither hides itself
            # nor aborts the enumeration of the remaining disks.
            print(f"Subprocess Error: {e}")
            smart_info = (e.output or b"").decode("utf-8", errors="replace")
        if "SMART support is: Enabled" in smart_info:
            wwn_ids.append(wwn)
    return wwn_ids
def get_smart_health(wwn_id: str, scrutiny_endpoint: str, timeout: float = 10.0) -> dict:
    """Fetch the Scrutiny device-details document for one disk.

    Args:
        wwn_id: WWN of the disk, used as the device id in the request path.
        scrutiny_endpoint: Base URL of the Scrutiny server, without a
            trailing slash.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it an unresponsive server would block the
            check indefinitely.

    Returns:
        The decoded JSON body on HTTP 200; an empty dict for any other
        status code (a message describing the problem is printed).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    url = f"{scrutiny_endpoint}/api/device/{wwn_id}/details"
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    if response.status_code == 404:
        print(f"Disk {wwn_id} not found on Scrutiny")
    else:
        print(f"Scrutiny Error {response.status_code} for disk {wwn_id}: {response.text}")
    return {}
def main(scrutiny_endpoint: str):
    """Print the Scrutiny SMART details of every locally detected disk.

    For each WWN returned by ``get_disk_wwn_ids`` the details document is
    requested from Scrutiny. Disks for which nothing came back are skipped;
    for the others the whole document is printed as indented JSON, followed
    by each entry of the first SMART result's ``attrs``.

    Args:
        scrutiny_endpoint: Base URL of the Scrutiny server.

    Note:
        ``results`` gets one empty entry per device name but is not returned
        or otherwise used by this function.
    """
    results = {}
    for wwn_id in get_disk_wwn_ids():
        details = get_smart_health(wwn_id, scrutiny_endpoint)
        if not details:
            # Nothing usable came back for this disk; move on to the next.
            continue

        print(f"Disk {wwn_id} SMART health:")
        print(json.dumps(details, indent=2))

        first_result_attrs = details['data']['smart_results'][0]['attrs']
        for metric in first_result_attrs:
            print(metric)

        device_name = details['data']['device']['device_name']
        results[device_name] = {}
if __name__ == "__main__":
    # Script entry point: read the Scrutiny base URL from the command line,
    # run the check, and turn any exception into an UNKNOWN plugin result.
    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('--scrutiny-endpoint', required=True, help='Base URL for scrutiny.')
    cli_args = arg_parser.parse_args()

    # Drop surrounding slashes so request paths can be appended with a single "/".
    endpoint = cli_args.scrutiny_endpoint.strip('/')

    try:
        main(endpoint)
    except Exception as e:
        import traceback

        print(f'UNKNOWN: exception "{e}"')
        print(traceback.format_exc())
        sys.exit(nagios.UNKNOWN)