From 877cda7be1ea45f04939bf1a54336978048dd095 Mon Sep 17 00:00:00 2001
From: kisfenyo
Date: Sun, 7 Jun 2026 11:14:48 +0200
Subject: [PATCH] updated configmap

---
Review notes (this section is ignored by git am):
 * Purpose: move the wan-alerts.yml rule file from the top of the ConfigMap
   `data:` mapping to the end of the file.
 * The re-added key is indented under `data:` again. At column 0 it would be
   a stray top-level field of the ConfigMap instead of a data entry.
 * `mul` / `div` are not Prometheus template functions, so the re-added
   annotations use humanizePercentage / humanizeDuration / humanize instead.
 * The post-image now ends with a newline.
 * Indentation of context and removed lines assumes the file's 2-space
   nesting; run `git apply --check` before applying. The post-image blob id
   on the `index` line predates these corrections.

 mon-system/prometheus-rules.yaml | 198 +++++++++++++++----------------
 1 file changed, 99 insertions(+), 99 deletions(-)

diff --git a/mon-system/prometheus-rules.yaml b/mon-system/prometheus-rules.yaml
index 20ac49c..3028b26 100644
--- a/mon-system/prometheus-rules.yaml
+++ b/mon-system/prometheus-rules.yaml
@@ -13,105 +13,6 @@ metadata:
   labels:
     app: prometheus
 data:
-
-# Add this as a new data key (wan-alerts.yml) in the existing
-# prometheus-rules ConfigMap (mon-system). Thresholds anchored to One.hu's
-# "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
-# Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
-# firing on a single fluke. Recalibrate floors after a week of baseline data.
-  wan-alerts.yml: |
-    groups:
-      - name: wan-quality-alerts
-        rules:
-          # --- upstream loss: the prime suspect for dropped calls / WireGuard ---
-          - alert: WanUpstreamPacketLoss
-            expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
-            for: 2m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN upstream packet loss to {{ $labels.target }}"
-              description: "irtt upstream loss {{ printf \"%.2f\" (mul $value 100) }}% (>1%) for 2m. Cable-upstream symptom; capture for ISP."
-
-          - alert: WanDownstreamPacketLoss
-            expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
-            for: 2m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN downstream packet loss to {{ $labels.target }}"
-              description: "irtt downstream loss {{ printf \"%.2f\" (mul $value 100) }}% (>1%) for 2m."
-
-          # --- latency / jitter ---
-          - alert: WanLatencyHigh
-            expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN RTT spikes to {{ $labels.target }}"
-              description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."
-
-          - alert: WanJitterHigh
-            expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN jitter high to {{ $labels.target }}"
-              description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."
-
-          # --- bufferbloat: latency added while the line is saturated ---
-          - alert: WanBufferbloat
-            expr: |
-              (
-                wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
-                - on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
-              ) > 0.1
-            for: 0m
-            labels:
-              severity: info
-            annotations:
-              summary: "WAN bufferbloat on {{ $labels.target }}"
-              description: "RTT rises {{ printf \"%.0f\" (mul $value 1000) }} ms under load (>100 ms). Line buckles when saturated."
-
-          # --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
-          - alert: WanDownloadDegraded
-            expr: wan_throughput_bits_per_second{direction="download"} < 350e6
-            for: 20m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN download below half of plan"
-              description: "Download {{ printf \"%.0f\" (div $value 1e6) }} Mbit/s (< 350, half of 700 normal) for 20m."
-
-          - alert: WanUploadDegraded
-            expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
-            for: 20m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN upload below half of plan"
-              description: "Upload {{ printf \"%.1f\" (div $value 1e6) }} Mbit/s (< 14, half of 28 normal) for 20m."
-
-          # --- the monitor itself stopped producing data ---
-          - alert: WanProbeStalled
-            expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
-            for: 0m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN probe '{{ $labels.probe }}' stalled"
-              description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."
-
-          - alert: WanBlackboxTargetDown
-            expr: probe_success{job=~"wan-.*"} == 0
-            for: 3m
-            labels:
-              severity: warning
-            annotations:
-              summary: "WAN probe to {{ $labels.instance }} failing"
-              description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."
 
   authentik-alerts.yml: |
     groups:
@@ -309,3 +210,102 @@ data:
             annotations:
               summary: "Longhorn node {{ $labels.node }} storage pressure"
               description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
+
+  # wan-alerts.yml: WAN quality alerts. Keep this key indented under `data:`;
+  # at column 0 it is a stray top-level field, not part of the ConfigMap data.
+  # Thresholds anchor to One.hu's "normal conditions": 700 Mbit/s down / 28 up.
+  # Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
+  # firing on a single fluke. Recalibrate floors after a week of baseline data.
+  wan-alerts.yml: |
+    groups:
+      - name: wan-quality-alerts
+        rules:
+          # --- upstream loss: the prime suspect for dropped calls / WireGuard ---
+          - alert: WanUpstreamPacketLoss
+            expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN upstream packet loss to {{ $labels.target }}"
+              description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
+
+          - alert: WanDownstreamPacketLoss
+            expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
+            for: 2m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN downstream packet loss to {{ $labels.target }}"
+              description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
+
+          # --- latency / jitter ---
+          - alert: WanLatencyHigh
+            expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN RTT spikes to {{ $labels.target }}"
+              description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."
+
+          - alert: WanJitterHigh
+            expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN jitter high to {{ $labels.target }}"
+              description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."
+
+          # --- bufferbloat: latency added while the line is saturated ---
+          - alert: WanBufferbloat
+            expr: |
+              (
+                wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
+                - on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
+              ) > 0.1
+            for: 0m
+            labels:
+              severity: info
+            annotations:
+              summary: "WAN bufferbloat on {{ $labels.target }}"
+              description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
+
+          # --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
+          - alert: WanDownloadDegraded
+            expr: wan_throughput_bits_per_second{direction="download"} < 350e6
+            for: 20m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN download below half of plan"
+              description: "Download {{ $value | humanize }}bit/s (< 350 Mbit/s, half of 700 normal) for 20m."
+
+          - alert: WanUploadDegraded
+            expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
+            for: 20m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN upload below half of plan"
+              description: "Upload {{ $value | humanize }}bit/s (< 14 Mbit/s, half of 28 normal) for 20m."
+
+          # --- the monitor itself stopped producing data ---
+          - alert: WanProbeStalled
+            expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
+            for: 0m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN probe '{{ $labels.probe }}' stalled"
+              description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."
+
+          - alert: WanBlackboxTargetDown
+            expr: probe_success{job=~"wan-.*"} == 0
+            for: 3m
+            labels:
+              severity: warning
+            annotations:
+              summary: "WAN probe to {{ $labels.instance }} failing"
+              description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."