diff --git a/admin-system/wan-monitor.yaml b/admin-system/wan-monitor.yaml
index c33c459..04bcfa8 100644
--- a/admin-system/wan-monitor.yaml
+++ b/admin-system/wan-monitor.yaml
@@ -224,7 +224,7 @@ spec:
         - name: wan-probe
           # Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
-          image: gitea.dooplex.hu/viktor/wan-probe:0.1.0
+          image: gitea.dooplex.hu/admin/wan-probe:0.1.0
           command: ["/bin/sh", "/scripts/probe-loop.sh"]
           env:
             - name: HETZNER_HOST
diff --git a/mon-system/prometheus-rules.yaml b/mon-system/prometheus-rules.yaml
index 9270ff2..5073da3 100644
--- a/mon-system/prometheus-rules.yaml
+++ b/mon-system/prometheus-rules.yaml
@@ -13,7 +13,105 @@ metadata:
   labels:
     app: prometheus
 data:
+  # wan-alerts.yml: WAN quality rules (irtt loss/RTT/jitter, throughput, probe
+  # health). Thresholds are anchored to One.hu's "normal conditions" figures:
+  # 700 Mbit/s down / 28 Mbit/s up.
+  # Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
+  # firing on a single fluke. Recalibrate floors after a week of baseline data.
+  wan-alerts.yml: |
+    groups:
+      - name: wan-quality-alerts
+        rules:
+          # --- packet loss on an idle line (ratio > 0.01); upstream is the prime suspect for dropped calls / WireGuard ---
+          - alert: WanUpstreamPacketLoss
+            expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN upstream packet loss to {{ $labels.target }}"
+              description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
+          - alert: WanDownstreamPacketLoss
+            expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN downstream packet loss to {{ $labels.target }}"
+              description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
+
+          # --- latency / jitter on an idle line (80 ms max RTT, 30 ms round-trip jitter) ---
+          - alert: WanLatencyHigh
+            expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN RTT spikes to {{ $labels.target }}"
+              description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."
+
+          - alert: WanJitterHigh
+            expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN jitter high to {{ $labels.target }}"
+              description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."
+
+          # --- bufferbloat: mean RTT under load minus mean idle RTT, matched per target ---
+          - alert: WanBufferbloat
+            expr: |
+              (
+                wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
+                - on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
+              ) > 0.1
+            for: 0m
+            labels:
+              severity: info
+            annotations:
+              summary: "WAN bufferbloat on {{ $labels.target }}"
+              description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
+
+          # --- throughput vs One.hu "normal" 700/28 (alert below 50%); $value is in bit/s ---
+          - alert: WanDownloadDegraded
+            expr: wan_throughput_bits_per_second{direction="download"} < 350e6
+            for: 20m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN download below half of plan"
+              description: "Download {{ $value | humanize }}bit/s (< 350 Mbit/s, half of 700 normal) for 20m."
+
+          - alert: WanUploadDegraded
+            expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
+            for: 20m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN upload below half of plan"
+              description: "Upload {{ $value | humanize }}bit/s (< 14 Mbit/s, half of 28 normal) for 20m."
+
+          # --- monitor health. NOTE(review): 300 s assumes each probe runs < 5 min apart; confirm for the ~15 min throughput probe ---
+          - alert: WanProbeStalled
+            expr: (time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300) or absent(wan_probe_last_run_timestamp_seconds)
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN probe '{{ or $labels.probe \"(no metrics)\" }}' stalled"
+              description: "No fresh samples for >5 min, or the timestamp metric is missing entirely. Check the wan-monitor pod / Hetzner endpoint."
+
+          - alert: WanBlackboxTargetDown
+            expr: probe_success{job=~"wan-.*"} == 0
+            for: 3m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN probe to {{ $labels.instance }} failing"
+              description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."
+
   authentik-alerts.yml: |
     groups:
       - name: authentik-availability