diff --git a/mon-system/prometheus-rules.yaml b/mon-system/prometheus-rules.yaml
index 83574d2..9270ff2 100644
--- a/mon-system/prometheus-rules.yaml
+++ b/mon-system/prometheus-rules.yaml
@@ -13,6 +13,147 @@ metadata:
   labels:
     app: prometheus
 data:
+
+  authentik-alerts.yml: |
+    groups:
+      - name: authentik-availability
+        rules:
+          # Server/worker: fire when the scrape fails (up == 0) or when the target is
+          # gone from service discovery (absent); a removed pod leaves no "up" series.
+          - alert: AuthentikServerDown
+            expr: up{job="authentik-server"} == 0 or absent(up{job="authentik-server"})
+            for: 3m
+            labels:
+              severity: critical
+              component: authentik
+            annotations:
+              summary: "Authentik server is unreachable"
+              description: "authentik-server scrape has been failing for 3+ minutes. Logins will fail."
+
+          - alert: AuthentikWorkerDown
+            expr: up{job="authentik-worker"} == 0 or absent(up{job="authentik-worker"})
+            for: 3m
+            labels:
+              severity: critical
+              component: authentik
+            annotations:
+              summary: "Authentik worker is unreachable"
+              description: "authentik-worker scrape has been failing for 3+ minutes. Background tasks are not running."
+
+          # Per-outpost disconnect. Downgraded to warning because a single
+          # outpost failure only breaks a subset of apps (e.g. just the Arr
+          # stack if arr-outpost is down), not the entire IDP.
+          - alert: AuthentikOutpostDown
+            expr: up{job="authentik-outposts"} == 0
+            for: 5m
+            labels:
+              severity: warning
+              component: authentik
+            annotations:
+              summary: "Authentik outpost {{ $labels.outpost }} is unreachable"
+              description: "Outpost {{ $labels.outpost }} has been down for 5+ minutes. Apps behind this outpost cannot authenticate."
+
+      - name: authentik-functional
+        rules:
+          # *** The 13-days-ago alert. ***
+          # Fires when the worker has tasks in progress but throughput
+          # has collapsed. Probes cannot catch this because the pod is
+          # technically alive — only the queue dynamics give it away.
+          - alert: AuthentikTaskQueueStuck
+            expr: |
+              sum(authentik_tasks_in_progress{job="authentik-worker"}) > 5
+              and  # "or vector(0)": a counter with no series yet counts as zero throughput
+              (sum(rate(authentik_tasks_total{job="authentik-worker"}[5m])) or vector(0)) < 0.01
+            for: 10m
+            labels:
+              severity: critical
+              component: authentik
+            annotations:
+              summary: "Authentik worker queue is stuck"
+              description: "{{ $value }} tasks in progress with near-zero throughput for 10+ min. Worker is alive but not draining the queue — typically resolved by restarting the deployment."
+
+          # Softer version: many tasks stay in progress for a long time even
+          # though work may still be moving. Could indicate an expensive task
+          # blocking the workers, a task that keeps retrying, or steady overload.
+          - alert: AuthentikTaskBacklog
+            expr: sum(authentik_tasks_in_progress{job="authentik-worker"}) > 20
+            for: 15m
+            labels:
+              severity: warning
+              component: authentik
+            annotations:
+              summary: "Authentik task backlog >20 for 15 min"
+              description: "{{ $value }} tasks in progress for 15+ min. Likely overload or a single hanging task."
+
+          # User-visible error signal. Threshold is conservative —
+          # on a low-traffic homelab, 0.1 err/sec = ~6 errors/min which
+          # is already noticeable to users.
+          - alert: AuthentikHighErrorRate
+            expr: |
+              sum(rate(django_http_responses_total_by_status_total{job="authentik-server",status=~"5.."}[5m])) > 0.1
+            for: 5m
+            labels:
+              severity: warning
+              component: authentik
+            annotations:
+              summary: "Authentik is serving 5xx errors"
+              description: "{{ $value | printf \"%.2f\" }} 5xx responses/sec for 5+ min."
+
+          # p95 latency guard. 2s is high for modern auth flows — at this
+          # point users are visibly waiting on the login page.
+          - alert: AuthentikHighLatency
+            expr: |
+              histogram_quantile(0.95,
+                sum by (le) (rate(authentik_main_request_duration_seconds_bucket{job="authentik-server",dest="core"}[5m]))
+              ) > 2
+            for: 10m
+            labels:
+              severity: warning
+              component: authentik
+            annotations:
+              summary: "Authentik p95 request latency > 2s"
+              description: "p95 latency {{ $value | printf \"%.2f\" }}s for 10+ min. Logins are slow."
+
+  postgresql-alerts.yml: |
+    groups:
+      - name: postgresql-availability
+        rules:
+          - alert: PostgresExporterDown
+            expr: up{job="cloudnativepg"} == 0
+            for: 2m
+            labels:
+              severity: critical
+              component: postgresql
+            annotations:
+              summary: "CloudNativePG metrics endpoint unreachable"
+              description: "CNPG metrics exporter on {{ $labels.pod }} has been down for 2+ min. Postgres may be down or the sidecar may have crashed."
+
+      - name: postgresql-capacity
+        rules:
+          # 160 = 80% of max_connections=200. Postgres enforces that limit per
+          # instance, so backends are summed per pod. Update 160 if the limit changes.
+          - alert: PostgresHighConnections
+            expr: sum by (cluster, pod) (cnpg_backends_total) > 160
+            for: 5m
+            labels:
+              severity: warning
+              component: postgresql
+            annotations:
+              summary: "Postgres cluster {{ $labels.cluster }} nearing connection limit"
+              description: "{{ $value }} active connections on {{ $labels.pod }} (>80% of max_connections=200). Check for connection leaks."
+
+          # Locks held long enough that other queries are waiting.
+          # Usually cleared fast; a sustained non-zero value is abnormal.
+          - alert: PostgresBackendsWaiting
+            expr: cnpg_backends_waiting_total > 0
+            for: 5m
+            labels:
+              severity: warning
+              component: postgresql
+            annotations:
+              summary: "Postgres has queries blocked on locks"
+              description: "{{ $value }} backends waiting on locks for 5+ min. Investigate long-running transactions."
+
   longhorn-alerts.yml: |
     groups:
       - name: longhorn-volume-alerts