Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| d9bed3832c |
@@ -1,344 +0,0 @@
|
||||
---
|
||||
# ============================================================================
|
||||
# wan-monitor — internet connection quality monitoring
|
||||
# Single pod (3 containers) in admin-system:
|
||||
# - blackbox : prometheus blackbox-exporter (HTTP phases, ICMP, DNS) :9115
|
||||
# - wan-probe : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
|
||||
# - metrics-http : busybox httpd serving /shared/metrics :9116
|
||||
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
|
||||
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
|
||||
# ============================================================================
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: wan-monitor-blackbox
|
||||
namespace: admin-system
|
||||
labels:
|
||||
app: wan-monitor
|
||||
data:
|
||||
blackbox.yml: |
|
||||
modules:
|
||||
http_2xx:
|
||||
prober: http
|
||||
timeout: 10s
|
||||
http:
|
||||
preferred_ip_protocol: ip4
|
||||
ip_protocol_fallback: false
|
||||
method: GET
|
||||
fail_if_not_ssl: false
|
||||
icmp:
|
||||
prober: icmp
|
||||
timeout: 5s
|
||||
icmp:
|
||||
preferred_ip_protocol: ip4
|
||||
ip_protocol_fallback: false
|
||||
dns_udp:
|
||||
prober: dns
|
||||
timeout: 5s
|
||||
dns:
|
||||
transport_protocol: udp
|
||||
preferred_ip_protocol: ip4
|
||||
query_name: "telex.hu"
|
||||
query_type: "A"
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: wan-monitor-scripts
|
||||
namespace: admin-system
|
||||
labels:
|
||||
app: wan-monitor
|
||||
data:
|
||||
metrics-header.prom: |
|
||||
# HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
|
||||
# TYPE wan_irtt_rtt_seconds gauge
|
||||
# HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
|
||||
# TYPE wan_irtt_jitter_seconds gauge
|
||||
# HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
|
||||
# TYPE wan_irtt_loss_ratio gauge
|
||||
# HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
|
||||
# TYPE wan_irtt_late_ratio gauge
|
||||
# HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
|
||||
# TYPE wan_irtt_duplicate_ratio gauge
|
||||
# HELP wan_irtt_packets irtt packet counters for the run
|
||||
# TYPE wan_irtt_packets gauge
|
||||
# HELP wan_irtt_success 1 if the irtt run produced stats
|
||||
# TYPE wan_irtt_success gauge
|
||||
# HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
|
||||
# TYPE wan_throughput_bits_per_second gauge
|
||||
# HELP wan_throughput_success 1 if the throughput test succeeded
|
||||
# TYPE wan_throughput_success gauge
|
||||
# HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
|
||||
# TYPE wan_probe_last_run_timestamp_seconds gauge
|
||||
|
||||
irtt_to_prom.py: |
|
||||
#!/usr/bin/env python3
|
||||
# irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
|
||||
# args: <condition> <target>
|
||||
# Hardened: tolerates missing/null/NaN/Inf fields and clamps ratios to 0..1
|
||||
# so a pathological irtt run can never emit an out-of-range or invalid sample.
|
||||
import json
import math
import sys
import time


def label_escape(value):
    """Return *value* as text that is safe inside a quoted Prometheus label.

    Backslash, double quote and line feed are backslash-escaped; any other
    character is passed through unchanged.
    """
    text = str(value)
    # Backslash first, otherwise the escapes added below would be doubled.
    text = text.replace("\\", "\\\\")
    text = text.replace('"', '\\"')
    return text.replace("\n", "\\n")


# Positional arguments: <condition> <target>. The defaults keep the script
# usable when it is run by hand without arguments.
cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
# Label pairs shared by every sample line printed by this script.
L = f'target="{label_escape(target)}",condition="{label_escape(cond)}"'
# Captured once at start-up so all samples of this run carry the same
# whole-second timestamp.
ts = f'{time.time():.0f}'
|
||||
|
||||
def num(x, default=0.0):
    """Return *x* as a finite float, or *default* when that is not possible.

    *default* is returned for None and other unconvertible types (TypeError),
    non-numeric strings (ValueError), integers too large to fit in a float
    (OverflowError), and for NaN / +Inf / -Inf.
    """
    try:
        v = float(x)
    except (TypeError, ValueError, OverflowError):
        return default
    return v if math.isfinite(v) else default
|
||||
|
||||
def pct_ratio(x):
    """Turn a percentage into a ratio, clamped to the closed range 0..1.

    The input goes through num() first, so anything that is not a finite
    number counts as 0 percent.
    """
    ratio = num(x) / 100.0
    if ratio <= 0.0:
        return 0.0
    if ratio >= 1.0:
        return 1.0
    return ratio
|
||||
|
||||
def fail():
    """Report a failed irtt run and end the script.

    Prints wan_irtt_success with value 0 and the last-run timestamp, then
    exits with status 0.
    """
    failure_lines = (
        f'wan_irtt_success{{{L}}} 0',
        f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}',
    )
    for line in failure_lines:
        print(line)
    sys.exit(0)
|
||||
|
||||
# Read the irtt report from stdin. Input that cannot be parsed, or a report
# whose "stats" entry is not an object, is reported as a failed run.
try:
    report = json.load(sys.stdin)
    s = report.get("stats")
except Exception:
    fail()
if not isinstance(s, dict):
    fail()
|
||||
|
||||
# Round-trip-time statistics. Values are divided by 1e9 before printing
# (presumably irtt reports nanoseconds; the metric is documented as seconds).
rtt = s.get("rtt")
if not isinstance(rtt, dict):
    # Missing, null, or an unexpected scalar/list: every stat falls back to
    # num()'s default instead of raising AttributeError on .get().
    rtt = {}
for k in ("min", "mean", "median", "max", "stddev"):
    print(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {num(rtt.get(k)) / 1e9}')
|
||||
|
||||
def ipdv(key):
    """Return the "mean" of the stats entry *key*, divided by 1e9.

    A missing, null or non-dict entry yields 0.0 rather than raising, the
    same result num() gives for a missing "mean" field.
    """
    d = s.get(key)
    if not isinstance(d, dict):
        return 0.0
    return num(d.get("mean")) / 1e9
|
||||
|
||||
# Jitter: one sample per direction, taken from the matching ipdv_* entry.
for direction, key in (
    ("round_trip", "ipdv_round_trip"),
    ("send", "ipdv_send"),
    ("receive", "ipdv_receive"),
):
    print(f'wan_irtt_jitter_seconds{{{L},direction="{direction}"}} {ipdv(key)}')

# Loss: percentages from the report, converted to clamped ratios.
for direction, key in (
    ("round_trip", "packet_loss_percent"),
    ("upstream", "upstream_loss_percent"),
    ("downstream", "downstream_loss_percent"),
):
    print(f'wan_irtt_loss_ratio{{{L},direction="{direction}"}} {pct_ratio(s.get(key))}')

late_ratio = pct_ratio(s.get("late_packets_percent"))
print(f'wan_irtt_late_ratio{{{L}}} {late_ratio}')
duplicate_ratio = pct_ratio(s.get("duplicate_percent"))
print(f'wan_irtt_duplicate_ratio{{{L}}} {duplicate_ratio}')

# Packet counters, truncated to integers.
for kind, key in (
    ("sent", "packets_sent"),
    ("received", "packets_received"),
    ("server_received", "server_packets_received"),
):
    print(f'wan_irtt_packets{{{L},kind="{kind}"}} {int(num(s.get(key)))}')

# Reaching this point means the report had a usable "stats" object.
print(f'wan_irtt_success{{{L}}} 1')
print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
|
||||
|
||||
tput_to_prom.py: |
|
||||
#!/usr/bin/env python3
# iperf3 JSON (stdin) -> Prometheus sample lines. args: <direction> <target>
# A run counts as successful only when end.sum_received.bits_per_second is a
# finite, non-negative number; anything else is reported as success 0.
import json
import math
import sys
import time


def label_escape(value):
    """Return *value* as text that is safe inside a quoted Prometheus label.

    Backslash, double quote and line feed are backslash-escaped.
    """
    text = str(value).replace("\\", "\\\\")
    return text.replace('"', '\\"').replace("\n", "\\n")


def throughput_lines(report, direction, target, ts):
    """Build the sample lines for one iperf3 run.

    report    -- decoded iperf3 JSON document (any value; a malformed one
                 yields the failure lines)
    direction -- value of the "direction" label
    target    -- value of the "target" label
    ts        -- timestamp text printed as the last-run sample value

    Returns a list of exposition lines: throughput and success=1 on a usable
    result, success=0 otherwise, always followed by the last-run timestamp.
    """
    labels = f'target="{label_escape(target)}",direction="{label_escape(direction)}"'
    try:
        bps = report["end"]["sum_received"]["bits_per_second"]
        # bool is an int subclass; NaN/Inf are accepted by Python's json
        # module. Neither is a meaningful throughput.
        if isinstance(bps, bool) or not math.isfinite(bps) or bps < 0:
            raise ValueError("bits_per_second is not a finite non-negative number")
        lines = [
            f'wan_throughput_bits_per_second{{{labels}}} {bps:.0f}',
            f'wan_throughput_success{{{labels}}} 1',
        ]
    except Exception:
        # Deliberately broad: any missing key or wrong type in the iperf3
        # output means "test failed", never a crash of the converter.
        lines = [f'wan_throughput_success{{{labels}}} 0']
    lines.append(
        f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{labels}}} {ts}'
    )
    return lines


def main():
    """Read iperf3 JSON from stdin and print the sample lines to stdout."""
    direction = sys.argv[1] if len(sys.argv) > 1 else "download"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    # Taken before stdin is read, so it marks when this converter started.
    ts = f'{time.time():.0f}'
    try:
        report = json.load(sys.stdin)
    except Exception:
        # Empty or truncated input (e.g. iperf3 timed out): report failure.
        report = None
    for line in throughput_lines(report, direction, target, ts):
        print(line)


if __name__ == "__main__":
    main()
|
||||
|
||||
probe-loop.sh: |
|
||||
#!/bin/sh
# Treat any reference to an unset variable as an error.
set -u

# Fixed paths: output directory and the static HELP/TYPE header.
SHARED=/shared
HDR=/scripts/metrics-header.prom

# The probe endpoint is mandatory; everything else has a default.
HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"
: "${IRTT_PORT:=2112}"
: "${IPERF_PORT:=5201}"
: "${IRTT_INTERVAL:=20ms}"
: "${IRTT_DURATION:=60}"   # seconds (numeric, for timeout math)
: "${TPUT_EVERY:=900}"     # seconds between throughput tests
: "${TPUT_TIME:=10}"       # iperf3 seconds per direction
: "${IRTT_TARGET:=hetzner}"
: "${TPUT_TARGET:=hetzner}"

# Empty unless IRTT_HMAC is set and non-empty.
HMAC_OPT="${IRTT_HMAC:+--hmac=${IRTT_HMAC}}"

# Start from empty fragments so the first assemble has all of its inputs.
mkdir -p "$SHARED"
for frag in .irtt.prom .irttload.prom .tput.prom; do
  : > "$SHARED/$frag"
done
# Serve the header immediately so first scrapes don't 404.
cp "$HDR" "$SHARED/metrics"
|
||||
|
||||
# Build the served metrics file: the header followed by the three fragments,
# written to a temp file first and then renamed over the live file.
assemble() {
  # Positional parameters are scoped to the function, so this needs no
  # extra variable.
  set -- "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom"
  cat "$@" > "$SHARED/.metrics.tmp" 2>/dev/null
  mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
}
|
||||
|
||||
# Each fragment is written to <file>.tmp then renamed, so assemble() never
# cats a partially written file (the cause of the impossible loss spikes).
#
# $1 condition label   $2 output fragment   $3 duration in seconds
#
# The fragment is replaced only when the converter exits 0. The converter
# exits 0 for handled failures too (it prints success 0), so a non-zero
# status means it crashed or could not start; the previous fragment is then
# kept and its ageing timestamp exposes the stall.
run_irtt() {
  # ${IRTT_HMAC:+"..."} expands to exactly one word when the key is set and
  # to nothing when it is not, so a key with spaces or glob characters is
  # neither split nor expanded.
  if timeout "$(( $3 + 25 ))" irtt client -i "$IRTT_INTERVAL" -d "${3}s" -q \
       ${IRTT_HMAC:+"--hmac=${IRTT_HMAC}"} \
       -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
     | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2.tmp"
  then
    mv "$2.tmp" "$2"
  else
    rm -f "$2.tmp"
  fi
}
|
||||
|
||||
# Run iperf3 once per direction and publish both results as one fragment.
# Output is collected in a .partial file and renamed when both runs are done.
run_tput() {
  P="${IPERF_PARALLEL:-4}"   # parallel streams: a single stream can't fill the pipe over the RTT
  TO="$(( TPUT_TIME + 20 ))"
  TMP="$SHARED/.tput.prom.partial"
  : > "$TMP"
  for tput_dir in download upload; do
    # -R is passed only for the download direction.
    case "$tput_dir" in
      download) rev="-R" ;;
      *)        rev="" ;;
    esac
    # $rev is intentionally unquoted: empty means no extra argument.
    timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" --connect-timeout 5000 $rev -J 2>/dev/null \
      | python3 /scripts/tput_to_prom.py "$tput_dir" "$TPUT_TARGET" >> "$TMP"
  done
  mv "$TMP" "$SHARED/.tput.prom"
}
|
||||
|
||||
# Main loop: one idle irtt run per cycle; every TPUT_EVERY seconds also a
# throughput test with a concurrent irtt run to measure latency under load.
last_tput=0
while true; do
  cycle_start=$(date +%s)
  run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION"   # normally blocks ~IRTT_DURATION = loop cadence
  assemble
  now=$(date +%s)
  if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
    LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))
    run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" &   # concurrent = bufferbloat
    LOADPID=$!
    run_tput
    wait "$LOADPID" 2>/dev/null
    last_tput="$now"
    assemble
  fi
  # When the idle run returns early (name resolution failure, irtt missing,
  # server refusing) nothing else paces the loop; wait out the rest of the
  # cycle instead of respawning irtt and python back to back.
  elapsed=$(( $(date +%s) - cycle_start ))
  [ "$elapsed" -lt 0 ] && elapsed=0   # clock stepped backwards
  if [ "$elapsed" -lt "$IRTT_DURATION" ]; then
    sleep "$(( IRTT_DURATION - elapsed ))"
  fi
done
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: wan-monitor
|
||||
namespace: admin-system
|
||||
labels:
|
||||
app: wan-monitor
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate
|
||||
selector:
|
||||
matchLabels:
|
||||
app: wan-monitor
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: wan-monitor
|
||||
annotations:
|
||||
enable.version-checker.io/blackbox: "true"
|
||||
enable.version-checker.io/metrics-http: "true"
|
||||
enable.version-checker.io/wan-probe: "true"
|
||||
match-regex.version-checker.io/blackbox: "^v[0-9]+\\.[0-9]+\\.[0-9]+$"
|
||||
match-regex.version-checker.io/metrics-http: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
|
||||
match-regex.version-checker.io/wan-probe: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
|
||||
spec:
|
||||
enableServiceLinks: false
|
||||
containers:
|
||||
- name: blackbox
|
||||
image: quay.io/prometheus/blackbox-exporter:v0.28.0
|
||||
args:
|
||||
- --config.file=/etc/blackbox/blackbox.yml
|
||||
- --web.listen-address=:9115
|
||||
ports:
|
||||
- name: blackbox
|
||||
containerPort: 9115
|
||||
securityContext:
|
||||
capabilities:
|
||||
add: ["NET_RAW"] # required for the ICMP prober
|
||||
resources:
|
||||
requests: { cpu: 10m, memory: 32Mi }
|
||||
limits: { memory: 64Mi }
|
||||
volumeMounts:
|
||||
- name: blackbox-config
|
||||
mountPath: /etc/blackbox
|
||||
readOnly: true
|
||||
|
||||
- name: wan-probe
|
||||
# Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
|
||||
image: gitea.dooplex.hu/admin/wan-probe:0.1.0
|
||||
command: ["/bin/sh", "/scripts/probe-loop.sh"]
|
||||
env:
|
||||
- name: HETZNER_HOST
|
||||
# MUST be the Hetzner origin: a DNS-only (grey-cloud) record or raw IP.
|
||||
# NOT the Cloudflare-proxied jarrs.eu — CF only forwards HTTP/HTTPS, so
|
||||
# UDP 2112 (irtt) / TCP 5201 (iperf3) never reach the origin behind it.
|
||||
value: "metrics.jarrs.eu" # DNS-only A record -> Hetzner IPv4
|
||||
- name: IRTT_PORT
|
||||
value: "2112"
|
||||
- name: IPERF_PORT
|
||||
value: "5201"
|
||||
- name: IRTT_INTERVAL
|
||||
value: "20ms"
|
||||
- name: IRTT_DURATION
|
||||
value: "60" # seconds (numeric)
|
||||
- name: TPUT_EVERY
|
||||
value: "900" # 15 min
|
||||
- name: TPUT_TIME
|
||||
value: "10"
|
||||
- name: IPERF_PARALLEL
|
||||
value: "4"
|
||||
- name: IRTT_HMAC # shared key; apply via secret (see below)
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: wan-monitor-irtt
|
||||
key: hmac
|
||||
optional: true
|
||||
resources:
|
||||
requests: { cpu: 20m, memory: 48Mi }
|
||||
limits: { memory: 96Mi }
|
||||
volumeMounts:
|
||||
- name: scripts
|
||||
mountPath: /scripts
|
||||
readOnly: true
|
||||
- name: shared
|
||||
mountPath: /shared
|
||||
|
||||
- name: metrics-http
|
||||
image: busybox:1.36
|
||||
command: ["httpd", "-f", "-v", "-p", "9116", "-h", "/shared"]
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 9116
|
||||
resources:
|
||||
requests: { cpu: 5m, memory: 8Mi }
|
||||
limits: { memory: 24Mi }
|
||||
volumeMounts:
|
||||
- name: shared
|
||||
mountPath: /shared
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: blackbox-config
|
||||
configMap:
|
||||
name: wan-monitor-blackbox
|
||||
- name: scripts
|
||||
configMap:
|
||||
name: wan-monitor-scripts
|
||||
- name: shared
|
||||
emptyDir: {}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: wan-monitor
|
||||
namespace: admin-system
|
||||
labels:
|
||||
app: wan-monitor
|
||||
spec:
|
||||
type: ClusterIP
|
||||
selector:
|
||||
app: wan-monitor
|
||||
ports:
|
||||
- name: blackbox
|
||||
port: 9115
|
||||
targetPort: 9115
|
||||
- name: metrics
|
||||
port: 9116
|
||||
targetPort: 9116
|
||||
@@ -237,75 +237,6 @@ data:
|
||||
regex: 'ak-outpost-(.*)-outpost'
|
||||
replacement: '$1'
|
||||
|
||||
# --- end-to-end latency + loss (ICMP) to many destinations ---
|
||||
- job_name: 'wan-icmp'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [icmp]
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets:
|
||||
- 8.8.8.8
|
||||
- 1.1.1.1
|
||||
- jarrs.eu # Hetzner
|
||||
- telex.hu
|
||||
- store.steampowered.com
|
||||
- 192.168.0.1 # gateway
|
||||
- 37.191.56.193 # your public IP (update if it changes)
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: wan-monitor.admin-system:9115
|
||||
|
||||
# --- HTTP phase breakdown (dns/connect/tls/processing/transfer) ---
|
||||
- job_name: 'wan-http'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [http_2xx]
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets:
|
||||
- https://telex.hu
|
||||
- https://store.steampowered.com
|
||||
- https://jarrs.eu
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: wan-monitor.admin-system:9115
|
||||
|
||||
# --- DNS resolution time per resolver (Pi-hole vs public) ---
|
||||
- job_name: 'wan-dns'
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module: [dns_udp]
|
||||
scrape_interval: 30s
|
||||
static_configs:
|
||||
- targets:
|
||||
- 192.168.0.250 # Pi-hole
|
||||
- 1.1.1.1
|
||||
- 8.8.8.8
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: wan-monitor.admin-system:9115
|
||||
|
||||
# --- irtt (UDP quality) + iperf3 (throughput) textfile metrics ---
|
||||
- job_name: 'wan-probe'
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 30s
|
||||
fallback_scrape_protocol: PrometheusText0.0.4
|
||||
static_configs:
|
||||
- targets: ['wan-monitor.admin-system:9116']
|
||||
|
||||
# CloudNativePG - Postgres metrics per instance
|
||||
- job_name: 'cloudnativepg'
|
||||
kubernetes_sd_configs:
|
||||
@@ -690,7 +621,7 @@ spec:
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 768Mi
|
||||
memory: 256Mi
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /var/lib/grafana
|
||||
|
||||
@@ -13,7 +13,7 @@ metadata:
|
||||
labels:
|
||||
app: prometheus
|
||||
data:
|
||||
|
||||
|
||||
authentik-alerts.yml: |
|
||||
groups:
|
||||
- name: authentik-availability
|
||||
@@ -210,104 +210,3 @@ data:
|
||||
annotations:
|
||||
summary: "Longhorn node {{ $labels.node }} storage pressure"
|
||||
description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
|
||||
|
||||
# Add this as a new data key (wan-alerts.yml) in the existing
|
||||
# prometheus-rules ConfigMap (mon-system). Thresholds anchored to One.hu's
|
||||
# "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
|
||||
# Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
|
||||
# firing on a single fluke. Recalibrate floors after a week of baseline data.
|
||||
# NOTE: uses Prometheus template funcs (humanize/humanizePercentage/humanizeDuration);
|
||||
# mul/div are NOT valid Prometheus template functions.
|
||||
wan-alerts.yml: |
|
||||
groups:
|
||||
- name: wan-quality-alerts
|
||||
rules:
|
||||
# --- upstream loss: the prime suspect for dropped calls / WireGuard ---
|
||||
- alert: WanUpstreamPacketLoss
|
||||
expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "WAN upstream packet loss to {{ $labels.target }}"
|
||||
description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
|
||||
|
||||
- alert: WanDownstreamPacketLoss
|
||||
expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "WAN downstream packet loss to {{ $labels.target }}"
|
||||
description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
|
||||
|
||||
# --- latency / jitter ---
|
||||
- alert: WanLatencyHigh
|
||||
expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "WAN RTT spikes to {{ $labels.target }}"
|
||||
description: "irtt max RTT {{ $value | humanizeDuration }} (>80 ms) for 5m (idle). Real-time apps will feel this."
|
||||
|
||||
- alert: WanJitterHigh
|
||||
expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "WAN jitter high to {{ $labels.target }}"
|
||||
description: "Round-trip jitter {{ $value | humanizeDuration }} (>30 ms) for 5m. Degrades VoIP/video."
|
||||
|
||||
# --- bufferbloat: latency added while the line is saturated ---
|
||||
- alert: WanBufferbloat
|
||||
expr: |
|
||||
(
|
||||
wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
|
||||
- on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
|
||||
) > 0.1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "WAN bufferbloat on {{ $labels.target }}"
|
||||
description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
|
||||
|
||||
# --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
|
||||
- alert: WanDownloadDegraded
|
||||
expr: wan_throughput_bits_per_second{direction="download"} < 350e6
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "WAN download below half of plan"
|
||||
description: "Download {{ $value | humanize }}bit/s (< 350M, half of 700 normal) for 20m."
|
||||
|
||||
- alert: WanUploadDegraded
|
||||
expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "WAN upload below half of plan"
|
||||
description: "Upload {{ $value | humanize }}bit/s (< 14M, half of 28 normal) for 20m."
|
||||
|
||||
# --- the monitor itself stopped producing data ---
|
||||
- alert: WanProbeStalled
  # The two probes refresh at different rates: irtt about once a minute,
  # throughput only every TPUT_EVERY (900 s). One shared 300 s limit would
  # fire for probe="throughput" between every pair of runs, so each probe
  # gets its own limit (throughput: two intervals plus slack).
  expr: |
    (time() - max by(probe) (wan_probe_last_run_timestamp_seconds{probe="irtt"}) > 300)
    or
    (time() - max by(probe) (wan_probe_last_run_timestamp_seconds{probe="throughput"}) > 2100)
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "WAN probe '{{ $labels.probe }}' stalled"
    description: "No fresh samples for {{ $value | humanizeDuration }} (limit: 5 min for irtt, 35 min for throughput). Check the wan-monitor pod / Hetzner endpoint."
|
||||
|
||||
- alert: WanBlackboxTargetDown
|
||||
expr: probe_success{job=~"wan-.*"} == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "WAN probe to {{ $labels.instance }} failing"
|
||||
description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."
|
||||
Reference in New Issue
Block a user