# homelab-manifests/mon-system/prometheus-rules.yaml

# =============================================================================
# Prometheus Alerting Rules (Authentik, PostgreSQL, Longhorn, WAN)
# Longhorn volume-space alerts exclude the prometheus-data PVC since it's
# designed to run at ~95% capacity
# =============================================================================
# ConfigMap carrying the Prometheus alerting rule files. Each key under `data`
# is one rule file stored as a literal block scalar (authentik, postgresql,
# longhorn and wan rule files are visible in this manifest).
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: mon-system
  labels:
    app: prometheus
# One key per rule file; the key name is the file name of the rule file.
data:

  authentik-alerts.yml: |
    # Authentik (IdP) alerting rules: scrape availability of the server, the
    # worker and the outposts, plus functional signals (task queue, 5xx rate,
    # p95 latency).
    groups:
      - name: authentik-availability
        rules:
          # Fires if Prometheus cannot scrape the server metrics endpoint, or
          # if the job has no targets left at all. `up == 0` on its own only
          # matches targets that still exist; when the target vanishes (e.g.
          # dropped by service discovery) `up` has no series and the alert
          # would stay silent, so absent() covers that case.
          - alert: AuthentikServerDown
            expr: |
              up{job="authentik-server"} == 0
              or
              absent(up{job="authentik-server"})
            for: 3m
            labels:
              severity: critical
              component: authentik
            annotations:
              summary: "Authentik server is unreachable"
              description: "authentik-server scrape has been failing or the target has been missing for 3+ minutes. Logins will fail."

          # Same pattern as the server rule: failed scrape OR missing target.
          - alert: AuthentikWorkerDown
            expr: |
              up{job="authentik-worker"} == 0
              or
              absent(up{job="authentik-worker"})
            for: 3m
            labels:
              severity: critical
              component: authentik
            annotations:
              summary: "Authentik worker is unreachable"
              description: "authentik-worker scrape has been failing or the target has been missing for 3+ minutes. Background tasks are not running."

          # Per-outpost disconnect. Downgraded to warning because a single
          # outpost failure only breaks a subset of apps (e.g. just the Arr
          # stack if arr-outpost is down), not the entire IDP.
          # NOTE(review): the annotations read $labels.outpost; `up` carries
          # only target labels, so this assumes the scrape config attaches an
          # `outpost` label to each target — confirm.
          - alert: AuthentikOutpostDown
            expr: up{job="authentik-outposts"} == 0
            for: 5m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik outpost {{ $labels.outpost }} is unreachable"
              description: "Outpost {{ $labels.outpost }} has been down for 5+ minutes. Apps behind this outpost cannot authenticate."

      - name: authentik-functional
        rules:
          # The stuck-worker rule (originally noted as "the 13-days-ago
          # alert"). Fires when the worker has tasks in progress but
          # throughput has collapsed. Probes cannot catch this because the
          # pod is technically alive — only the queue dynamics give it away.
          # Both sides are aggregated with sum(), so they share the empty
          # label set and `and` matches them; $value is the in-progress count.
          - alert: AuthentikTaskQueueStuck
            expr: |
              sum(authentik_tasks_in_progress{job="authentik-worker"}) > 5
              and
              sum(rate(authentik_tasks_total{job="authentik-worker"}[5m])) < 0.01
            for: 10m
            labels:
              severity: critical
              component: authentik
            annotations:
              summary: "Authentik worker queue is stuck"
              description: "{{ $value }} tasks in progress with near-zero throughput for 10+ min. Worker is alive but not draining the queue — typically resolved by restarting the deployment."

          # Softer version: many tasks in progress for a sustained period,
          # regardless of throughput. Could indicate an expensive task
          # blocking the workers, a task that keeps retrying, or steady
          # overload.
          - alert: AuthentikTaskBacklog
            expr: sum(authentik_tasks_in_progress{job="authentik-worker"}) > 20
            for: 15m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik task backlog >20 for 15 min"
              description: "{{ $value }} tasks in progress for 15+ min. Likely overload or a single hanging task."

          # User-visible error signal. Threshold is conservative —
          # on a low-traffic homelab, 0.1 err/sec = ~6 errors/min which
          # is already noticeable to users.
          - alert: AuthentikHighErrorRate
            expr: |
              sum(rate(django_http_responses_total_by_status_total{job="authentik-server",status=~"5.."}[5m])) > 0.1
            for: 5m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik is serving 5xx errors"
              description: "{{ $value | printf \"%.2f\" }} 5xx responses/sec for 5+ min."

          # p95 latency guard. 2s is high for modern auth flows — at this
          # point users are visibly waiting on the login page.
          - alert: AuthentikHighLatency
            expr: |
              histogram_quantile(0.95,
                sum by (le) (rate(authentik_main_request_duration_seconds_bucket{job="authentik-server",dest="core"}[5m]))
              ) > 2
            for: 10m
            labels:
              severity: warning
              component: authentik
            annotations:
              summary: "Authentik p95 request latency > 2s"
              description: "p95 latency {{ $value | printf \"%.2f\" }}s for 10+ min. Logins are slow."

  postgresql-alerts.yml: |
    # PostgreSQL (CloudNativePG) alerting rules: exporter availability,
    # connection headroom and lock waits.
    groups:
      - name: postgresql-availability
        rules:
          # Scrape of the CNPG metrics endpoint is failing for a target that
          # still exists in service discovery.
          - alert: PostgresExporterDown
            expr: up{job="cloudnativepg"} == 0
            for: 2m
            labels:
              severity: critical
              component: postgresql
            annotations:
              summary: "CloudNativePG metrics endpoint unreachable"
              description: "CNPG metrics exporter on {{ $labels.pod }} has been down for 2+ min. Postgres may be down or the sidecar may have crashed."

      - name: postgresql-capacity
        rules:
          # Threshold of 80% of your 200 max_connections = 160.
          # If you raise max_connections later, update the number.
          # max_connections is a per-instance limit, so backends are summed
          # per (cluster, pod). Summing over the whole cluster would add the
          # connections of every instance together and compare the total
          # against a single instance's limit, alerting falsely on
          # multi-instance clusters.
          - alert: PostgresHighConnections
            expr: sum by (cluster, pod) (cnpg_backends_total) > 160
            for: 5m
            labels:
              severity: warning
              component: postgresql
            annotations:
              summary: "Postgres cluster {{ $labels.cluster }} nearing connection limit"
              description: "{{ $value }} active connections on {{ $labels.pod }} (>80% of max_connections=200). Check for connection leaks."

          # Locks held long enough that other queries are waiting.
          # Usually cleared fast; a sustained non-zero value is abnormal.
          - alert: PostgresBackendsWaiting
            expr: cnpg_backends_waiting_total > 0
            for: 5m
            labels:
              severity: warning
              component: postgresql
            annotations:
              summary: "Postgres has queries blocked on locks"
              description: "{{ $value }} backends waiting on locks for 5+ min. Investigate long-running transactions."

  longhorn-alerts.yml: |
    # Longhorn alerting rules: per-volume space usage, volume robustness and
    # per-node disk pressure.
    groups:
      - name: longhorn-volume-alerts
        rules:
          # Critical: Volume at 95% capacity (excluding prometheus-data).
          # The excluded `volume` value is a literal volume name (presumably
          # the PV backing prometheus-data — verify); it has to be updated by
          # hand if that volume is ever re-provisioned under a new name.
          - alert: LonghornVolumeSpaceCritical
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 95
            for: 5m
            labels:
              severity: critical
              component: longhorn
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is critically full"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Immediate action required."

          # Warning: Volume at 85% capacity (excluding prometheus-data)
          - alert: LonghornVolumeSpaceWarning
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 85
            for: 10m
            labels:
              severity: warning
              component: longhorn
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is running low on space"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Consider expanding or cleaning up."

          # Volume degraded. longhorn_volume_robustness encodes
          # 0=unknown, 1=healthy, 2=degraded, 3=faulted. Matching `!= 1` would
          # also fire on 0 (unknown), which is not a degradation, so degraded
          # and faulted are matched explicitly instead.
          - alert: LonghornVolumeDegraded
            expr: longhorn_volume_robustness == 2
            for: 5m
            labels:
              severity: warning
              component: longhorn
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is degraded"
              description: "Volume {{ $labels.volume }} robustness is degraded. Check replica status."

          # Volume faulted (robustness 3): more severe than degraded, so it is
          # raised as critical rather than folded into the warning above.
          - alert: LonghornVolumeFaulted
            expr: longhorn_volume_robustness == 3
            for: 5m
            labels:
              severity: critical
              component: longhorn
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is faulted"
              description: "Volume {{ $labels.volume }} robustness is faulted. Check replica status immediately."

          # Node storage pressure
          - alert: LonghornNodeStoragePressure
            expr: |
              (
                longhorn_node_storage_usage_bytes
                /
                longhorn_node_storage_capacity_bytes
              ) * 100 > 90
            for: 10m
            labels:
              severity: warning
              component: longhorn
            annotations:
              summary: "Longhorn node {{ $labels.node }} storage pressure"
              description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."

  # WAN quality alerts (data key wan-alerts.yml). Thresholds are anchored to
  # One.hu's "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
  # Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
  # firing on a single fluke. Recalibrate floors after a week of baseline data.
  # NOTE: annotations use Prometheus template functions
  # (humanize/humanizePercentage/humanizeDuration); mul/div are NOT valid
  # Prometheus template functions.
  wan-alerts.yml: |
    # WAN quality alerting rules: packet loss, latency/jitter, bufferbloat,
    # throughput floors and health of the probes themselves. Every rule
    # carries `component: wan` so it can be routed, grouped and silenced the
    # same way as the authentik and postgresql rules.
    groups:
      - name: wan-quality-alerts
        rules:
          # --- upstream loss: the prime suspect for dropped calls / WireGuard ---
          - alert: WanUpstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN upstream packet loss to {{ $labels.target }}"
              description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."

          - alert: WanDownstreamPacketLoss
            expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
            for: 2m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN downstream packet loss to {{ $labels.target }}"
              description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."

          # --- latency / jitter ---
          - alert: WanLatencyHigh
            expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
            for: 5m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN RTT spikes to {{ $labels.target }}"
              description: "irtt max RTT {{ $value | humanizeDuration }} (>80 ms) for 5m (idle). Real-time apps will feel this."

          - alert: WanJitterHigh
            expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
            for: 5m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN jitter high to {{ $labels.target }}"
              description: "Round-trip jitter {{ $value | humanizeDuration }} (>30 ms) for 5m. Degrades VoIP/video."

          # --- bufferbloat: latency added while the line is saturated ---
          # Mean RTT under load minus mean RTT when idle, matched per target;
          # on(target) means the result carries only the `target` label.
          - alert: WanBufferbloat
            expr: |
              (
                wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
                - on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
              ) > 0.1
            for: 0m
            labels:
              severity: info
              component: wan
            annotations:
              summary: "WAN bufferbloat on {{ $labels.target }}"
              description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."

          # --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
          - alert: WanDownloadDegraded
            expr: wan_throughput_bits_per_second{direction="download"} < 350e6
            for: 20m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN download below half of plan"
              description: "Download {{ $value | humanize }}bit/s (< 350M, half of 700 normal) for 20m."

          - alert: WanUploadDegraded
            expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
            for: 20m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN upload below half of plan"
              description: "Upload {{ $value | humanize }}bit/s (< 14M, half of 28 normal) for 20m."

          # --- the monitor itself stopped producing data ---
          # Fires as soon as a probe's last-run timestamp is older than 300 s.
          # NOTE(review): throughput is reportedly sampled only every ~15 min;
          # if that probe also updates wan_probe_last_run_timestamp_seconds,
          # a 300 s threshold with `for: 0m` would fire between its runs —
          # confirm which `probe` values expose this metric.
          - alert: WanProbeStalled
            expr: time() - max by(probe) (wan_probe_last_run_timestamp_seconds) > 300
            for: 0m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN probe '{{ $labels.probe }}' stalled"
              description: "No fresh samples for >5 min. Check the wan-monitor pod / Hetzner endpoint."

          - alert: WanBlackboxTargetDown
            expr: probe_success{job=~"wan-.*"} == 0
            for: 3m
            labels:
              severity: warning
              component: wan
            annotations:
              summary: "WAN probe to {{ $labels.instance }} failing"
              description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."