---
# =============================================================================
# Prometheus alerting rules: Authentik, CloudNativePG, Longhorn, WAN quality
#
# One data key per rule file. The Longhorn capacity alerts exclude the
# prometheus-data PVC since it's designed to run at ~95% capacity.
# =============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: mon-system
  labels:
    app: prometheus
data:
  authentik-alerts.yml: |
    groups:
      - name: authentik-availability
        rules:
          # Fires if Prometheus cannot scrape the server metrics endpoint.
          # Equivalent to "authentik-server pod is unreachable or crashed."
          - alert: AuthentikServerDown
            expr: up{job="authentik-server"} == 0
            for: 3m
            labels:
              severity: critical
              component: authentik
            annotations:
              summary: "Authentik server is unreachable"
              description: "authentik-server scrape has been failing for 3+ minutes. Logins will fail."

          - alert: AuthentikWorkerDown
            expr: up{job="authentik-worker"} == 0
            for: 3m
            labels:
              severity: critical
              component: authentik
            annotations:
              summary: "Authentik worker is unreachable"
              description: "authentik-worker scrape has been failing for 3+ minutes. Background tasks are not running."

          # Per-outpost disconnect. Downgraded to warning because a single
          # outpost failure only breaks a subset of apps (e.g. just the Arr
          # stack if arr-outpost is down), not the entire IDP.
          - alert: AuthentikOutpostDown
            expr: up{job="authentik-outposts"} == 0
            for: 5m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik outpost {{ $labels.outpost }} is unreachable"
              description: "Outpost {{ $labels.outpost }} has been down for 5+ minutes. Apps behind this outpost cannot authenticate."

      - name: authentik-functional
        rules:
          # *** The 13-days-ago alert. ***
          # Fires when the worker has tasks in progress but throughput
          # has collapsed. Probes cannot catch this because the pod is
          # technically alive — only the queue dynamics give it away.
          # $value in the description is the left-hand side of `and`,
          # i.e. the number of tasks in progress.
          - alert: AuthentikTaskQueueStuck
            expr: |
              sum(authentik_tasks_in_progress{job="authentik-worker"}) > 5
              and
              sum(rate(authentik_tasks_total{job="authentik-worker"}[5m])) < 0.01
            for: 10m
            labels:
              severity: critical
              component: authentik
            annotations:
              summary: "Authentik worker queue is stuck"
              description: "{{ $value }} tasks in progress with near-zero throughput for 10+ min. Worker is alive but not draining the queue — typically resolved by restarting the deployment."

          # Softer version: queue is growing but still moving somewhat.
          # Could indicate an expensive task blocking the workers, or
          # a task that keeps retrying, or steady overload.
          - alert: AuthentikTaskBacklog
            expr: sum(authentik_tasks_in_progress{job="authentik-worker"}) > 20
            for: 15m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik task backlog >20 for 15 min"
              description: "{{ $value }} tasks in progress for 15+ min. Likely overload or a single hanging task."

          # User-visible error signal. Threshold is conservative —
          # on a low-traffic homelab, 0.1 err/sec = ~6 errors/min which
          # is already noticeable to users.
          - alert: AuthentikHighErrorRate
            expr: |
              sum(rate(django_http_responses_total_by_status_total{job="authentik-server",status=~"5.."}[5m])) > 0.1
            for: 5m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik is serving 5xx errors"
              description: "{{ $value | printf \"%.2f\" }} 5xx responses/sec for 5+ min."

          # p95 latency guard. 2s is high for modern auth flows — at this
          # point users are visibly waiting on the login page.
          - alert: AuthentikHighLatency
            expr: |
              histogram_quantile(0.95,
                sum by (le) (rate(authentik_main_request_duration_seconds_bucket{job="authentik-server",dest="core"}[5m]))
              ) > 2
            for: 10m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik p95 request latency > 2s"
              description: "p95 latency {{ $value | printf \"%.2f\" }}s for 10+ min. Logins are slow."

  postgresql-alerts.yml: |
    groups:
      - name: postgresql-availability
        rules:
          - alert: PostgresExporterDown
            expr: up{job="cloudnativepg"} == 0
            for: 2m
            labels:
              severity: critical
              component: postgresql
            annotations:
              summary: "CloudNativePG metrics endpoint unreachable"
              description: "CNPG metrics exporter on {{ $labels.pod }} has been down for 2+ min. Postgres may be down or the sidecar may have crashed."

      - name: postgresql-capacity
        rules:
          # Threshold of 80% of your 200 max_connections = 160.
          # If you raise max_connections later, update the number.
          # NOTE(review): max_connections is a per-instance limit, while this
          # sums backends across every series in the cluster. If the cluster
          # runs replicas the sum can overstate pressure — confirm.
          - alert: PostgresHighConnections
            expr: sum by (cluster) (cnpg_backends_total) > 160
            for: 5m
            labels:
              severity: warning
              component: postgresql
            annotations:
              summary: "Postgres cluster {{ $labels.cluster }} nearing connection limit"
              description: "{{ $value }} active connections (>80% of max_connections=200). Check for connection leaks."

          # Locks held long enough that other queries are waiting.
          # Usually cleared fast; a sustained non-zero value is abnormal.
          - alert: PostgresBackendsWaiting
            expr: cnpg_backends_waiting_total > 0
            for: 5m
            labels:
              severity: warning
              component: postgresql
            annotations:
              summary: "Postgres has queries blocked on locks"
              description: "{{ $value }} backends waiting on locks for 5+ min. Investigate long-running transactions."

  longhorn-alerts.yml: |
    groups:
      - name: longhorn-volume-alerts
        rules:
          # Critical: Volume at 95% capacity (excluding prometheus-data)
          - alert: LonghornVolumeSpaceCritical
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 95
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is critically full"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Immediate action required."

          # Warning: Volume at 85% capacity (excluding prometheus-data)
          - alert: LonghornVolumeSpaceWarning
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 85
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is running low on space"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Consider expanding or cleaning up."

          # Volume degraded or faulted.
          # Longhorn robustness gauge: 0=unknown, 1=healthy, 2=degraded,
          # 3=faulted. Match only 2 and 3; `!= 1` would also fire for
          # volumes reporting 0 (unknown), which is not a replica problem.
          - alert: LonghornVolumeDegraded
            expr: longhorn_volume_robustness >= 2
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is degraded"
              description: "Volume {{ $labels.volume }} robustness is not healthy. Check replica status."

          # Node storage pressure
          - alert: LonghornNodeStoragePressure
            expr: |
              (
                longhorn_node_storage_usage_bytes
                /
                longhorn_node_storage_capacity_bytes
              ) * 100 > 90
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn node {{ $labels.node }} storage pressure"
              description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."

  # wan-alerts.yml: WAN quality rules, kept as their own data key in the
  # prometheus-rules ConfigMap (mon-system). Thresholds anchored to One.hu's
  # "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
  # Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
  # firing on a single fluke. Recalibrate floors after a week of baseline data.
  #
  # Annotations use only Prometheus' built-in template functions
  # (humanize, humanizePercentage, humanizeDuration). Arithmetic helpers such
  # as `mul` / `div` are not defined there and make the rule file fail to load.
  wan-alerts.yml: |
    groups:
      - name: wan-quality-alerts
        rules:
          # --- upstream loss: the prime suspect for dropped calls / WireGuard ---
          - alert: WanUpstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "WAN upstream packet loss to {{ $labels.target }}"
              description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."

          - alert: WanDownstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
            annotations:
              summary: "WAN downstream packet loss to {{ $labels.target }}"
              description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."

          # --- latency / jitter ---
          - alert: WanLatencyHigh
            expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "WAN RTT spikes to {{ $labels.target }}"
              description: "irtt max RTT > 80 ms for 5m (idle). Real-time apps will feel this."

          - alert: WanJitterHigh
            expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "WAN jitter high to {{ $labels.target }}"
              description: "Round-trip jitter > 30 ms for 5m. Degrades VoIP/video."

          # --- bufferbloat: latency added while the line is saturated ---
          # The subtraction matches on `target` only, so the result (and
          # $labels) carries just that label; $value is the added RTT in
          # seconds.
          - alert: WanBufferbloat
            expr: |
              (
                wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
                - on(target)
                wan_irtt_rtt_seconds{stat="mean",condition="idle"}
              ) > 0.1
            for: 0m
            labels:
              severity: info
            annotations:
              summary: "WAN bufferbloat on {{ $labels.target }}"
              description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."

          # --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
          # `humanize` renders the bits/s value with an SI prefix (e.g. 320M),
          # so the unit suffix "bit/s" is appended directly.
          - alert: WanDownloadDegraded
            expr: wan_throughput_bits_per_second{direction="download"} < 350e6
            for: 20m
            labels:
              severity: warning
            annotations:
              summary: "WAN download below half of plan"
              description: "Download {{ $value | humanize }}bit/s (< 350 Mbit/s, half of 700 normal) for 20m."

          - alert: WanUploadDegraded
            expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
            for: 20m
            labels:
              severity: warning
            annotations:
              summary: "WAN upload below half of plan"
              description: "Upload {{ $value | humanize }}bit/s (< 14 Mbit/s, half of 28 normal) for 20m."

          # --- the monitor itself stopped producing data ---
          # NOTE(review): throughput is sampled every ~15 min. If that probe
          # reports its own `probe` series in this metric, a 300 s window
          # will fire between runs — confirm, and widen per probe if so.
          - alert: WanProbeStalled
            expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "WAN probe '{{ $labels.probe }}' stalled"
              description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."

          - alert: WanBlackboxTargetDown
            expr: probe_success{job=~"wan-.*"} == 0
            for: 3m
            labels:
              severity: warning
            annotations:
              summary: "WAN probe to {{ $labels.instance }} failing"
              description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."