updated configmap
This commit is contained in:
@@ -14,105 +14,6 @@ metadata:
|
|||||||
app: prometheus
|
app: prometheus
|
||||||
data:
|
data:
|
||||||
|
|
||||||
# WAN quality alert rules, stored as the wan-alerts.yml data key of the
|
|
||||||
# prometheus-rules ConfigMap (mon-system). Thresholds anchored to One.hu's
|
|
||||||
# "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
|
|
||||||
# Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
|
|
||||||
# firing on a single fluke. Recalibrate floors after a week of baseline data.
|
|
||||||
wan-alerts.yml: |
|
|
||||||
groups:
|
|
||||||
- name: wan-quality-alerts
|
|
||||||
rules:
|
|
||||||
# --- upstream loss: the prime suspect for dropped calls / WireGuard ---
|
|
||||||
- alert: WanUpstreamPacketLoss
|
|
||||||
expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN upstream packet loss to {{ $labels.target }}"
|
|
||||||
# humanizePercentage renders the 0-1 loss ratio as a percentage; Prometheus
# templates have no arithmetic helpers such as mul/div (those are Sprig/Helm).
description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
|
|
||||||
|
|
||||||
- alert: WanDownstreamPacketLoss
|
|
||||||
expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN downstream packet loss to {{ $labels.target }}"
|
|
||||||
# humanizePercentage renders the 0-1 loss ratio as a percentage; Prometheus
# templates have no arithmetic helpers such as mul/div (those are Sprig/Helm).
description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
|
|
||||||
|
|
||||||
# --- latency / jitter ---
|
|
||||||
- alert: WanLatencyHigh
|
|
||||||
expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN RTT spikes to {{ $labels.target }}"
|
|
||||||
description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."
|
|
||||||
|
|
||||||
- alert: WanJitterHigh
|
|
||||||
expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN jitter high to {{ $labels.target }}"
|
|
||||||
description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."
|
|
||||||
|
|
||||||
# --- bufferbloat: latency added while the line is saturated ---
|
|
||||||
- alert: WanBufferbloat
|
|
||||||
expr: |
|
|
||||||
(
|
|
||||||
wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
|
|
||||||
- on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
|
|
||||||
) > 0.1
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: info
|
|
||||||
annotations:
|
|
||||||
summary: "WAN bufferbloat on {{ $labels.target }}"
|
|
||||||
# $value is the added RTT in seconds; humanizeDuration prints it with a unit
# (e.g. "150ms"). Prometheus templates have no mul/div functions.
description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
|
|
||||||
|
|
||||||
# --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
|
|
||||||
- alert: WanDownloadDegraded
|
|
||||||
expr: wan_throughput_bits_per_second{direction="download"} < 350e6
|
|
||||||
for: 20m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN download below half of plan"
|
|
||||||
# $value is in bit/s; humanize adds the SI prefix (e.g. "320M"), so the unit
# suffix is just "bit/s". Prometheus templates have no mul/div functions.
description: "Download {{ $value | humanize }}bit/s (< 350 Mbit/s, half of 700 normal) for 20m."
|
|
||||||
|
|
||||||
- alert: WanUploadDegraded
|
|
||||||
expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
|
|
||||||
for: 20m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN upload below half of plan"
|
|
||||||
# $value is in bit/s; humanize adds the SI prefix (e.g. "12.5M"), so the unit
# suffix is just "bit/s". Prometheus templates have no mul/div functions.
description: "Upload {{ $value | humanize }}bit/s (< 14 Mbit/s, half of 28 normal) for 20m."
|
|
||||||
|
|
||||||
# --- the monitor itself stopped producing data ---
|
|
||||||
- alert: WanProbeStalled
|
|
||||||
expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
|
|
||||||
for: 0m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN probe '{{ $labels.probe }}' stalled"
|
|
||||||
description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."
|
|
||||||
|
|
||||||
- alert: WanBlackboxTargetDown
|
|
||||||
expr: probe_success{job=~"wan-.*"} == 0
|
|
||||||
for: 3m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "WAN probe to {{ $labels.instance }} failing"
|
|
||||||
description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."
|
|
||||||
|
|
||||||
authentik-alerts.yml: |
|
authentik-alerts.yml: |
|
||||||
groups:
|
groups:
|
||||||
- name: authentik-availability
|
- name: authentik-availability
|
||||||
@@ -309,3 +210,102 @@ data:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: "Longhorn node {{ $labels.node }} storage pressure"
|
summary: "Longhorn node {{ $labels.node }} storage pressure"
|
||||||
description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
|
description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
|
||||||
|
|
||||||
|
# WAN quality alert rules, stored as the wan-alerts.yml data key of the
|
||||||
|
# prometheus-rules ConfigMap (mon-system). Thresholds anchored to One.hu's
|
||||||
|
# "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
|
||||||
|
# Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
|
||||||
|
# firing on a single fluke. Recalibrate floors after a week of baseline data.
|
||||||
|
wan-alerts.yml: |
|
||||||
|
groups:
|
||||||
|
- name: wan-quality-alerts
|
||||||
|
rules:
|
||||||
|
# --- upstream loss: the prime suspect for dropped calls / WireGuard ---
|
||||||
|
- alert: WanUpstreamPacketLoss
|
||||||
|
expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN upstream packet loss to {{ $labels.target }}"
|
||||||
|
# humanizePercentage renders the 0-1 loss ratio as a percentage; Prometheus
# templates have no arithmetic helpers such as mul/div (those are Sprig/Helm).
description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
|
||||||
|
|
||||||
|
- alert: WanDownstreamPacketLoss
|
||||||
|
expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN downstream packet loss to {{ $labels.target }}"
|
||||||
|
# humanizePercentage renders the 0-1 loss ratio as a percentage; Prometheus
# templates have no arithmetic helpers such as mul/div (those are Sprig/Helm).
description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
|
||||||
|
|
||||||
|
# --- latency / jitter ---
|
||||||
|
- alert: WanLatencyHigh
|
||||||
|
expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN RTT spikes to {{ $labels.target }}"
|
||||||
|
description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."
|
||||||
|
|
||||||
|
- alert: WanJitterHigh
|
||||||
|
expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN jitter high to {{ $labels.target }}"
|
||||||
|
description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."
|
||||||
|
|
||||||
|
# --- bufferbloat: latency added while the line is saturated ---
|
||||||
|
- alert: WanBufferbloat
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
|
||||||
|
- on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
|
||||||
|
) > 0.1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: "WAN bufferbloat on {{ $labels.target }}"
|
||||||
|
# $value is the added RTT in seconds; humanizeDuration prints it with a unit
# (e.g. "150ms"). Prometheus templates have no mul/div functions.
description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
|
||||||
|
|
||||||
|
# --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
|
||||||
|
- alert: WanDownloadDegraded
|
||||||
|
expr: wan_throughput_bits_per_second{direction="download"} < 350e6
|
||||||
|
for: 20m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN download below half of plan"
|
||||||
|
# $value is in bit/s; humanize adds the SI prefix (e.g. "320M"), so the unit
# suffix is just "bit/s". Prometheus templates have no mul/div functions.
description: "Download {{ $value | humanize }}bit/s (< 350 Mbit/s, half of 700 normal) for 20m."
|
||||||
|
|
||||||
|
- alert: WanUploadDegraded
|
||||||
|
expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
|
||||||
|
for: 20m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN upload below half of plan"
|
||||||
|
# $value is in bit/s; humanize adds the SI prefix (e.g. "12.5M"), so the unit
# suffix is just "bit/s". Prometheus templates have no mul/div functions.
description: "Upload {{ $value | humanize }}bit/s (< 14 Mbit/s, half of 28 normal) for 20m."
|
||||||
|
|
||||||
|
# --- the monitor itself stopped producing data ---
|
||||||
|
- alert: WanProbeStalled
|
||||||
|
expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN probe '{{ $labels.probe }}' stalled"
|
||||||
|
description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."
|
||||||
|
|
||||||
|
- alert: WanBlackboxTargetDown
|
||||||
|
expr: probe_success{job=~"wan-.*"} == 0
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN probe to {{ $labels.instance }} failing"
|
||||||
|
description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."
|
||||||
Reference in New Issue
Block a user