---
# =============================================================================
# Prometheus Alerting Rules for Longhorn
# Excludes prometheus-data PVC since it's designed to run at ~95% capacity
# =============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: mon-system
  labels:
    app: prometheus
data:
  # Everything under this key is the literal text of a Prometheus rule file.
  longhorn-alerts.yml: |
    groups:
      - name: longhorn-volume-alerts
        rules:
          # NOTE: the volume ID excluded in the selectors below is the Longhorn
          # volume backing the prometheus-data PVC. It is hard-coded in four
          # places; update all of them if that PVC is ever re-provisioned.

          # Critical: Volume at 95% capacity (excluding prometheus-data)
          - alert: LonghornVolumeSpaceCritical
            expr: |
              (
                avg by (volume) (
                  longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}
                )
                /
                avg by (volume) (
                  longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}
                )
              ) * 100 > 95
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is critically full"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Immediate action required."

          # Warning: Volume at 85% capacity (excluding prometheus-data)
          # This threshold is also exceeded whenever the critical rule above
          # matches, so both alerts can be active for the same volume.
          - alert: LonghornVolumeSpaceWarning
            expr: |
              (
                avg by (volume) (
                  longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}
                )
                /
                avg by (volume) (
                  longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}
                )
              ) * 100 > 85
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is running low on space"
              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Consider expanding or cleaning up."

          # Volume degraded
          # longhorn_volume_robustness: 0=unknown, 1=healthy, 2=degraded,
          # 3=faulted. Match the degraded state explicitly; "!= 1" would also
          # fire for volumes whose robustness is unknown.
          - alert: LonghornVolumeDegraded
            expr: longhorn_volume_robustness == 2
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is degraded"
              description: "Volume {{ $labels.volume }} robustness is not healthy. Check replica status."

          # Volume faulted
          # Reported separately, at critical severity, so a faulted volume is
          # not surfaced as a mere "degraded" warning.
          - alert: LonghornVolumeFaulted
            expr: longhorn_volume_robustness == 3
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is faulted"
              description: "Volume {{ $labels.volume }} robustness is faulted. Immediate action required."

          # Node storage pressure
          - alert: LonghornNodeStoragePressure
            expr: |
              (
                longhorn_node_storage_usage_bytes
                /
                longhorn_node_storage_capacity_bytes
              ) * 100 > 90
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn node {{ $labels.node }} storage pressure"
              description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."