# File: homelab-manifests/mon-system/prometheus-rules.yaml
# Last modified: 2026-01-07 15:59:09 +01:00

# =============================================================================
# Prometheus Alerting Rules for Longhorn
# The volume-space alerts exclude the prometheus-data PVC, because that volume
# is designed to run at ~95% capacity.
# =============================================================================
# ConfigMap carrying the Longhorn alerting rules as a single embedded rules
# file under the key `longhorn-alerts.yml`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: mon-system
  labels:
    app: prometheus
data:
  # Literal block scalar: everything below is the verbatim content of the rules
  # file, so its indentation and comments are part of the stored value.
  longhorn-alerts.yml: |
    groups:
      - name: longhorn-volume-alerts
        rules:
          # Critical: Volume at 95% capacity (excluding prometheus-data).
          # Usage is actual size / capacity per volume, as a percentage. The
          # hard-coded volume ID below is the excluded prometheus-data volume;
          # it must be updated by hand if that PVC is ever re-created.
          - alert: LonghornVolumeSpaceCritical
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 95
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is critically full"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Immediate action required."

          # Warning: Volume at 85% capacity (excluding prometheus-data).
          # Same ratio and exclusion as the critical rule, with a lower
          # threshold and a longer hold time. A volume above 95% matches both
          # rules.
          - alert: LonghornVolumeSpaceWarning
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 85
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is running low on space"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Consider expanding or cleaning up."

          # Volume degraded or faulted.
          # Longhorn documents robustness as 0=unknown, 1=healthy, 2=degraded,
          # 3=faulted. Matching `>= 2` keeps the degraded and faulted states
          # but skips 0 (unknown), which detached volumes report; `!= 1` would
          # raise a false alert for every detached volume.
          - alert: LonghornVolumeDegraded
            expr: longhorn_volume_robustness >= 2
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is degraded"
              description: "Volume {{ $labels.volume }} robustness is not healthy. Check replica status."

          # Node storage pressure: used / total Longhorn storage per node
          # above 90% for 10 minutes.
          - alert: LonghornNodeStoragePressure
            expr: |
              (
                longhorn_node_storage_usage_bytes
                /
                longhorn_node_storage_capacity_bytes
              ) * 100 > 90
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn node {{ $labels.node }} storage pressure"
              description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."