added authentik and postgres alerts
This commit is contained in:
@@ -13,6 +13,147 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: prometheus
|
app: prometheus
|
||||||
data:
|
data:
|
||||||
|
|
||||||
|
authentik-alerts.yml: |
|
||||||
|
groups:
|
||||||
|
- name: authentik-availability
|
||||||
|
rules:
|
||||||
|
# Fires when Prometheus cannot scrape the authentik-server metrics
# endpoint, i.e. the authentik-server pod is unreachable or crashed.
# `up == 0` alone yields no series once the target drops out of service
# discovery altogether, so absent() is OR-ed in to keep the alert firing
# when no `up` series exists for the job at all.
- alert: AuthentikServerDown
  expr: |
    up{job="authentik-server"} == 0
    or
    absent(up{job="authentik-server"})
  for: 3m
  labels:
    severity: critical
    component: authentik
  annotations:
    summary: "Authentik server is unreachable"
    description: "authentik-server scrape has been failing for 3+ minutes. Logins will fail."
|
||||||
|
|
||||||
|
# Fires when Prometheus cannot scrape the authentik-worker target.
# `up == 0` alone yields no series once the target drops out of service
# discovery altogether, so absent() is OR-ed in to keep the alert firing
# when no `up` series exists for the job at all.
- alert: AuthentikWorkerDown
  expr: |
    up{job="authentik-worker"} == 0
    or
    absent(up{job="authentik-worker"})
  for: 3m
  labels:
    severity: critical
    component: authentik
  annotations:
    summary: "Authentik worker is unreachable"
    description: "authentik-worker scrape has been failing for 3+ minutes. Background tasks are not running."
|
||||||
|
|
||||||
|
# One alert per outpost scrape target. Severity is only `warning`:
# losing a single outpost breaks just the apps behind it (for example
# the Arr stack when arr-outpost is down), not the entire IDP.
# NOTE(review): the annotations read an `outpost` label from the `up`
# series — confirm the scrape config attaches that label to each target.
- alert: AuthentikOutpostDown
  expr: 'up{job="authentik-outposts"} == 0'
  for: 5m
  labels:
    severity: warning
    component: authentik
  annotations:
    summary: 'Authentik outpost {{ $labels.outpost }} is unreachable'
    description: >-
      Outpost {{ $labels.outpost }} has been down for 5+ minutes.
      Apps behind this outpost cannot authenticate.
|
||||||
|
|
||||||
|
- name: authentik-functional
|
||||||
|
rules:
|
||||||
|
# Detects a worker that is up but no longer making progress: more than
# 5 tasks in progress while the 5m rate of authentik_tasks_total stays
# below 0.01/s. Probes cannot catch this because the pod is technically
# alive — only the queue dynamics give it away.
# NOTE(review): the original comment tagged this "the 13-days-ago
# alert", presumably pointing at a past incident — replace with a dated
# reference if one exists.
- alert: AuthentikTaskQueueStuck
  expr: |
    sum(authentik_tasks_in_progress{job="authentik-worker"}) > 5
    and
    sum(rate(authentik_tasks_total{job="authentik-worker"}[5m])) < 0.01
  for: 10m
  labels:
    severity: critical
    component: authentik
  annotations:
    summary: 'Authentik worker queue is stuck'
    description: >-
      {{ $value }} tasks in progress with near-zero throughput for 10+ min.
      Worker is alive but not draining the queue — typically resolved by
      restarting the deployment.
|
||||||
|
|
||||||
|
# Softer companion to the stuck-queue alert: the queue is large but may
# still be moving. Possible causes are an expensive task blocking the
# workers, a task that keeps retrying, or steady overload.
- alert: AuthentikTaskBacklog
  expr: 'sum(authentik_tasks_in_progress{job="authentik-worker"}) > 20'
  for: 15m
  labels:
    severity: warning
    component: authentik
  annotations:
    summary: 'Authentik task backlog >20 for 15 min'
    description: >-
      {{ $value }} tasks in progress for 15+ min.
      Likely overload or a single hanging task.
|
||||||
|
|
||||||
|
# User-visible error signal: 5xx responses from authentik-server,
# averaged over 5m. The threshold is deliberately conservative — on a
# low-traffic homelab 0.1 err/sec is roughly 6 errors per minute, which
# users already notice.
- alert: AuthentikHighErrorRate
  expr: |
    sum(rate(django_http_responses_total_by_status_total{job="authentik-server",status=~"5.."}[5m])) > 0.1
  for: 5m
  labels:
    severity: warning
    component: authentik
  annotations:
    summary: 'Authentik is serving 5xx errors'
    description: '{{ $value | printf "%.2f" }} 5xx responses/sec for 5+ min.'
|
||||||
|
|
||||||
|
# Guard on p95 request latency for dest="core", computed from the
# request-duration histogram over 5m. 2s is high for modern auth flows —
# at that point users are visibly waiting on the login page.
- alert: AuthentikHighLatency
  expr: |
    histogram_quantile(0.95,
      sum by (le) (rate(authentik_main_request_duration_seconds_bucket{job="authentik-server",dest="core"}[5m]))
    ) > 2
  for: 10m
  labels:
    severity: warning
    component: authentik
  annotations:
    summary: 'Authentik p95 request latency > 2s'
    description: 'p95 latency {{ $value | printf "%.2f" }}s for 10+ min. Logins are slow.'
|
||||||
|
|
||||||
|
postgresql-alerts.yml: |
|
||||||
|
groups:
|
||||||
|
- name: postgresql-availability
|
||||||
|
rules:
|
||||||
|
# Fires when a scrape of the CloudNativePG metrics endpoint has been
# failing for 2 minutes.
# NOTE(review): `up == 0` returns nothing if the target vanishes from
# service discovery entirely, so a fully removed cluster would not
# trigger this rule — consider a separate absent() alert.
- alert: PostgresExporterDown
  expr: 'up{job="cloudnativepg"} == 0'
  for: 2m
  labels:
    severity: critical
    component: postgresql
  annotations:
    summary: 'CloudNativePG metrics endpoint unreachable'
    description: >-
      CNPG metrics exporter on {{ $labels.pod }} has been down for 2+ min.
      Postgres may be down or the sidecar may have crashed.
|
||||||
|
|
||||||
|
- name: postgresql-capacity
|
||||||
|
rules:
|
||||||
|
# Threshold is 80% of max_connections=200, i.e. 160. The number is
# hard-coded: if max_connections is raised later, update it here.
# max_connections is enforced per PostgreSQL instance, so backends are
# aggregated per pod. Summing over the whole cluster would compare the
# combined total of every instance against a single instance's limit and
# fire early on multi-instance clusters.
# NOTE(review): relies on a `pod` label being present on
# cnpg_backends_total — confirm against the scrape config.
- alert: PostgresHighConnections
  expr: sum by (cluster, pod) (cnpg_backends_total) > 160
  for: 5m
  labels:
    severity: warning
    component: postgresql
  annotations:
    summary: "Postgres cluster {{ $labels.cluster }} nearing connection limit"
    description: "{{ $value }} active connections (>80% of max_connections=200). Check for connection leaks."
|
||||||
|
|
||||||
|
# Backends blocked on locks held by other sessions. Such waits normally
# clear quickly, so a value that stays above zero for 5 minutes is
# abnormal.
- alert: PostgresBackendsWaiting
  expr: 'cnpg_backends_waiting_total > 0'
  for: 5m
  labels:
    severity: warning
    component: postgresql
  annotations:
    summary: 'Postgres has queries blocked on locks'
    description: >-
      {{ $value }} backends waiting on locks for 5+ min.
      Investigate long-running transactions.
|
||||||
|
|
||||||
longhorn-alerts.yml: |
|
longhorn-alerts.yml: |
|
||||||
groups:
|
groups:
|
||||||
- name: longhorn-volume-alerts
|
- name: longhorn-volume-alerts
|
||||||
|
|||||||
Reference in New Issue
Block a user