From 7f95ce919e277a74ba8f1367aaa893b35902e0f9 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Sat, 16 Sep 2023 13:33:45 -0600 Subject: [PATCH] check_iperf3: add retry if iperf3 server is busy\ check_matrix_synapse: fix --- check_iperf3.sh | 38 ++++++++++++++++++++++++++------------ checker/synapse_grafana.py | 28 ++++++++++++++-------------- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/check_iperf3.sh b/check_iperf3.sh index 133211a..ea19fc3 100755 --- a/check_iperf3.sh +++ b/check_iperf3.sh @@ -34,7 +34,6 @@ # Warnng and critical levels are based on your specific network speed. - # Default values SERVER="" WARNING_LEVEL="" @@ -42,8 +41,8 @@ CRITICAL_LEVEL="" RSA_PUBLIC_KEY="" USERNAME="" PASSWORD="" +RETRY=3 -# Parse named arguments while [[ $# -gt 0 ]]; do key="$1" @@ -78,6 +77,11 @@ while [[ $# -gt 0 ]]; do shift shift ;; + --retry) + RETRY="$2" + shift + shift + ;; *) shift ;; @@ -99,17 +103,27 @@ fi export IPERF3_PASSWORD="$PASSWORD" # Run iperf3 command with optional arguments -if [[ -n "$RSA_PUBLIC_KEY" ]] && [[ -n "$USERNAME" ]]; then - OUTPUT=$(iperf3 -c "$SERVER" -i 1 -t 10 -f m --rsa-public-key-path "$RSA_PUBLIC_KEY" --username "$USERNAME" 2>&1) -else - OUTPUT=$(iperf3 -c "$SERVER" -i 1 -t 10 -f m 2>&1) -fi +for ((i = 1; i <= RETRY; i++)); do + if [[ -n "$RSA_PUBLIC_KEY" ]] && [[ -n "$USERNAME" ]]; then + OUTPUT=$(iperf3 -c "$SERVER" -i 1 -t 30 -f m --rsa-public-key-path "$RSA_PUBLIC_KEY" --username "$USERNAME" 2>&1) + else + OUTPUT=$(iperf3 -c "$SERVER" -i 1 -t 10 -f m 2>&1) + fi -# Check if iperf3 command failed -if [[ $? -ne 0 ]]; then - echo -e "UNKNOWN - iperf3 command failed: $OUTPUT\n" - exit -1 -fi + # Check if iperf3 command failed + if [[ $? -ne 0 ]]; then + if [[ $OUTPUT == *"the server is busy running a test. try again later"* ]]; then + if [[ $i -lt $RETRY ]]; then + sleep 30 + continue + fi + fi + echo -e "UNKNOWN - iperf3 command failed: $OUTPUT\n" + exit -1 + else + break + fi +done # Extract the receiver bitrate RECEIVER_BITRATE=$(echo "$OUTPUT" | grep -Eo '[0-9]+(\.[0-9]+)? Mbits/sec' | tail -1 | awk '{print $1}') diff --git a/checker/synapse_grafana.py b/checker/synapse_grafana.py index b47d348..af85575 100644 --- a/checker/synapse_grafana.py +++ b/checker/synapse_grafana.py @@ -23,7 +23,7 @@ def get_avg_python_gc_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'rate(python_gc_time_sum{instance="172.0.2.118:9000",job=~"synapse",index=~".*"}[2m])/rate(python_gc_time_count[2m])', + 'expr': 'rate(python_gc_time_sum{instance="matrix.synapse",job=~".*",index=~".*"}[2m])/rate(python_gc_time_count[2m])', 'format': 'time_series', 'intervalFactor': 2, 'refId': 'A', @@ -69,7 +69,7 @@ def get_outgoing_http_request_rate(api_key, interval, data_range, endpoint): 'uid': 'DAMPdbiIz' }, 'editorMode': 'code', - 'expr': 'rate(synapse_http_client_requests_total{job=~"synapse",index=~".*",instance="172.0.2.118:9000"}[2m])', + 'expr': 'rate(synapse_http_client_requests_total{job=~".*",index=~".*",instance="matrix.synapse"}[2m])', 'range': True, 'refId': 'A', 'interval': '', @@ -84,7 +84,7 @@ def get_outgoing_http_request_rate(api_key, interval, data_range, endpoint): 'uid': 'DAMPdbiIz' }, 'editorMode': 'code', - 'expr': 'rate(synapse_http_matrixfederationclient_requests_total{job=~"synapse",index=~".*",instance="172.0.2.118:9000"}[2m])', + 'expr': 'rate(synapse_http_matrixfederationclient_requests_total{job=~".*",index=~".*",instance="matrix.synapse"}[2m])', 'range': True, 'refId': 'B', 'interval': '', @@ -114,7 +114,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'histogram_quantile(0.99, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m])) by (le))', + 'expr': 'histogram_quantile(0.99, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m])) by (le))', 'format': 'time_series', 'intervalFactor': 1, 'refId': 'D', @@ -134,7 +134,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'histogram_quantile(0.9, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m])) by (le))', + 'expr': 'histogram_quantile(0.9, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m])) by (le))', 'format': 'time_series', 'interval': '', 'intervalFactor': 1, @@ -151,7 +151,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'histogram_quantile(0.75, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m])) by (le))', + 'expr': 'histogram_quantile(0.75, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m])) by (le))', 'format': 'time_series', 'intervalFactor': 1, 'refId': 'C', @@ -168,7 +168,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'histogram_quantile(0.5, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m])) by (le))', + 'expr': 'histogram_quantile(0.5, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m])) by (le))', 'format': 'time_series', 'intervalFactor': 1, 'refId': 'B', @@ -185,7 +185,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'histogram_quantile(0.25, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m])) by (le))', + 'expr': 'histogram_quantile(0.25, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m])) by (le))', 'refId': 'F', 'interval': '', 'queryType': 'timeSeriesQuery', @@ -200,7 +200,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'histogram_quantile(0.05, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m])) by (le))', + 'expr': 'histogram_quantile(0.05, sum(rate(synapse_http_server_response_time_seconds_bucket{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m])) by (le))', 'refId': 'G', 'interval': '', 'queryType': 'timeSeriesQuery', @@ -215,7 +215,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'sum(rate(synapse_http_server_response_time_seconds_sum{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m])) / sum(rate(synapse_http_server_response_time_seconds_count{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="172.0.2.118:9000",code=~"2.."}[2m]))', + 'expr': 'sum(rate(synapse_http_server_response_time_seconds_sum{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m])) / sum(rate(synapse_http_server_response_time_seconds_count{servlet=\'RoomSendEventRestServlet\',index=~".*",instance="matrix.synapse",code=~"2.."}[2m]))', 'refId': 'H', 'interval': '', 'queryType': 'timeSeriesQuery', @@ -230,7 +230,7 @@ def get_event_send_time(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'sum(rate(synapse_storage_events_persisted_events_total{instance="172.0.2.118:9000"}[2m]))', + 'expr': 'sum(rate(synapse_storage_events_persisted_events_total{instance="matrix.synapse"}[2m]))', 'hide': False, 'instant': False, 'refId': 'E', @@ -261,7 +261,7 @@ def get_waiting_for_db(api_key, interval, data_range, endpoint): 'type': 'prometheus', 'uid': 'DAMPdbiIz', }, - 'expr': 'rate(synapse_storage_schedule_time_sum{instance="172.0.2.118:9000",job=~"synapse",index=~".*"}[30s])/rate(synapse_storage_schedule_time_count[30s])', + 'expr': 'rate(synapse_storage_schedule_time_sum{instance="matrix.synapse",job=~".*",index=~".*"}[30s])/rate(synapse_storage_schedule_time_count[30s])', 'format': 'time_series', 'intervalFactor': 2, 'refId': 'A', @@ -305,7 +305,7 @@ def get_stateres_worst_case(api_key, interval, data_range, endpoint): 'uid': 'DAMPdbiIz', }, 'exemplar': False, - 'expr': 'sum(rate(synapse_state_res_db_for_biggest_room_seconds_total{instance="172.0.2.118:9000"}[1m]))', + 'expr': 'sum(rate(synapse_state_res_db_for_biggest_room_seconds_total{instance="matrix.synapse"}[1m]))', 'format': 'time_series', 'hide': False, 'instant': False, @@ -324,7 +324,7 @@ def get_stateres_worst_case(api_key, interval, data_range, endpoint): 'uid': 'DAMPdbiIz', }, 'exemplar': False, - 'expr': 'sum(rate(synapse_state_res_cpu_for_biggest_room_seconds_total{instance="172.0.2.118:9000"}[1m]))', + 'expr': 'sum(rate(synapse_state_res_cpu_for_biggest_room_seconds_total{instance="matrix.synapse"}[1m]))', 'format': 'time_series', 'hide': False, 'instant': False,