updated configmap

2026-06-07 11:14:48 +02:00
parent 0887848d29
commit 877cda7be1
1 changed files with 99 additions and 99 deletions
@@ -14,105 +14,6 @@ metadata:
    app: prometheus
 data:
 # WAN quality alert rules: the wan-alerts.yml data key of the prometheus-rules
 # ConfigMap (mon-system). Throughput floors are anchored to One.hu's
 # "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
 # Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
 # firing on a single fluke. Recalibrate floors after a week of baseline data.
 # Annotations use only Prometheus' built-in template helpers (humanize*):
 # Sprig-style arithmetic such as `mul` / `div` is not defined there and makes
 # the whole rule file fail to load.
 # Keep the key below aligned with the other keys under `data:`.
  wan-alerts.yml: |
    groups:
      - name: wan-quality-alerts
        rules:
          # --- upstream loss: the prime suspect for dropped calls / WireGuard ---
          # $value is a 0..1 ratio; humanizePercentage renders it as e.g. "1.53%".
          - alert: WanUpstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "WAN upstream packet loss to {{ $labels.target }}"
              description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
          - alert: WanDownstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "WAN downstream packet loss to {{ $labels.target }}"
              description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
          # --- latency / jitter (thresholds are in seconds) ---
          - alert: WanLatencyHigh
            expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "WAN RTT spikes to {{ $labels.target }}"
              description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."
          - alert: WanJitterHigh
            expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "WAN jitter high to {{ $labels.target }}"
              description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."
          # --- bufferbloat: latency added while the line is saturated ---
          # The subtraction is matched on `target` only, so the result carries
          # just that label. $value is the added RTT in seconds;
          # humanizeDuration renders it as e.g. "153ms".
          - alert: WanBufferbloat
            expr: |
              (
                wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
                - on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
              ) > 0.1
            for: 0m
            labels:
              severity: info
            annotations:
              summary: "WAN bufferbloat on {{ $labels.target }}"
              description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
          # --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
          # $value is in bit/s; humanize adds the SI prefix, e.g. "345.2M".
          - alert: WanDownloadDegraded
            expr: wan_throughput_bits_per_second{direction="download"} < 350e6
            for: 20m
            labels:
              severity: warning
            annotations:
              summary: "WAN download below half of plan"
              description: "Download {{ $value | humanize }}bit/s (< 350 Mbit/s, half of 700 normal) for 20m."
          - alert: WanUploadDegraded
            expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
            for: 20m
            labels:
              severity: warning
            annotations:
              summary: "WAN upload below half of plan"
              description: "Upload {{ $value | humanize }}bit/s (< 14 Mbit/s, half of 28 normal) for 20m."
          # --- the monitor itself stopped producing data ---
          # NOTE(review): throughput is sampled every ~15 min; if that probe also
          # exports this timestamp, the 300 s limit fires between runs. Confirm
          # which `probe` values exist. If the series disappears entirely this
          # expression returns nothing, so pair it with an `up` / `absent()`
          # alert unless one already exists elsewhere.
          - alert: WanProbeStalled
            expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "WAN probe '{{ $labels.probe }}' stalled"
              description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."
          - alert: WanBlackboxTargetDown
            expr: probe_success{job=~"wan-.*"} == 0
            for: 3m
            labels:
              severity: warning
            annotations:
              summary: "WAN probe to {{ $labels.instance }} failing"
              description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."
  authentik-alerts.yml: |
    groups:
      - name: authentik-availability
@@ -309,3 +210,102 @@ data:
            annotations:
              summary: "Longhorn node {{ $labels.node }} storage pressure"
              description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
 # WAN quality alert rules: the wan-alerts.yml data key of the prometheus-rules
 # ConfigMap (mon-system). Throughput floors are anchored to One.hu's
 # "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
 # Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
 # firing on a single fluke. Recalibrate floors after a week of baseline data.
 # Annotations use only Prometheus' built-in template helpers (humanize*):
 # Sprig-style arithmetic such as `mul` / `div` is not defined there and makes
 # the whole rule file fail to load.
 # The key below must sit at the same column as the other keys under `data:`
 # (e.g. authentik-alerts.yml); one column further left it stops being a
 # ConfigMap data entry.
  wan-alerts.yml: |
    groups:
      - name: wan-quality-alerts
        rules:
          # --- upstream loss: the prime suspect for dropped calls / WireGuard ---
          # $value is a 0..1 ratio; humanizePercentage renders it as e.g. "1.53%".
          - alert: WanUpstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "WAN upstream packet loss to {{ $labels.target }}"
              description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
          - alert: WanDownstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "WAN downstream packet loss to {{ $labels.target }}"
              description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
          # --- latency / jitter (thresholds are in seconds) ---
          - alert: WanLatencyHigh
            expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "WAN RTT spikes to {{ $labels.target }}"
              description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."
          - alert: WanJitterHigh
            expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "WAN jitter high to {{ $labels.target }}"
              description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."
          # --- bufferbloat: latency added while the line is saturated ---
          # The subtraction is matched on `target` only, so the result carries
          # just that label. $value is the added RTT in seconds;
          # humanizeDuration renders it as e.g. "153ms".
          - alert: WanBufferbloat
            expr: |
              (
                wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
                - on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
              ) > 0.1
            for: 0m
            labels:
              severity: info
            annotations:
              summary: "WAN bufferbloat on {{ $labels.target }}"
              description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
          # --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
          # $value is in bit/s; humanize adds the SI prefix, e.g. "345.2M".
          - alert: WanDownloadDegraded
            expr: wan_throughput_bits_per_second{direction="download"} < 350e6
            for: 20m
            labels:
              severity: warning
            annotations:
              summary: "WAN download below half of plan"
              description: "Download {{ $value | humanize }}bit/s (< 350 Mbit/s, half of 700 normal) for 20m."
          - alert: WanUploadDegraded
            expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
            for: 20m
            labels:
              severity: warning
            annotations:
              summary: "WAN upload below half of plan"
              description: "Upload {{ $value | humanize }}bit/s (< 14 Mbit/s, half of 28 normal) for 20m."
          # --- the monitor itself stopped producing data ---
          # NOTE(review): throughput is sampled every ~15 min; if that probe also
          # exports this timestamp, the 300 s limit fires between runs. Confirm
          # which `probe` values exist. If the series disappears entirely this
          # expression returns nothing, so pair it with an `up` / `absent()`
          # alert unless one already exists elsewhere.
          - alert: WanProbeStalled
            expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "WAN probe '{{ $labels.probe }}' stalled"
              description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."
          - alert: WanBlackboxTargetDown
            expr: probe_success{job=~"wan-.*"} == 0
            for: 3m
            labels:
              severity: warning
            annotations:
              summary: "WAN probe to {{ $labels.instance }} failing"
              description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."