diff --git a/mon-system/prometheus-rules.yaml b/mon-system/prometheus-rules.yaml
new file mode 100644
index 0000000..262c9dd
--- /dev/null
+++ b/mon-system/prometheus-rules.yaml
@@ -0,0 +1,93 @@
+---
+# =============================================================================
+# Prometheus Alerting Rules for Longhorn
+#
+# Defines a ConfigMap whose `longhorn-alerts.yml` key holds one Prometheus
+# rule group, `longhorn-volume-alerts`.
+#
+# Excludes prometheus-data PVC since it's designed to run at ~95% capacity
+# =============================================================================
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-rules
+  namespace: mon-system
+  labels:
+    app: prometheus
+data:
+  longhorn-alerts.yml: |
+    groups:
+      - name: longhorn-volume-alerts
+        rules:
+          # Critical: Volume above 95% capacity for 5m (excluding prometheus-data)
+          # NOTE(review): the exclusion matches the volume by its generated
+          # name; presumably it must be updated if the prometheus-data PVC is
+          # ever re-provisioned -- confirm.
+          - alert: LonghornVolumeSpaceCritical
+            expr: |
+              (
+                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-6c6f1864-de15-4f10-9d73-8fbb678c391f"}))
+                /
+                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-6c6f1864-de15-4f10-9d73-8fbb678c391f"}))
+              ) * 100 > 95
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Longhorn volume {{ $labels.volume }} is critically full"
+              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Immediate action required."
+
+          # Warning: Volume above 85% capacity for 10m (excluding prometheus-data)
+          # The expression stays true above 95% as well, where the critical rule applies.
+          - alert: LonghornVolumeSpaceWarning
+            expr: |
+              (
+                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-6c6f1864-de15-4f10-9d73-8fbb678c391f"}))
+                /
+                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-6c6f1864-de15-4f10-9d73-8fbb678c391f"}))
+              ) * 100 > 85
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Longhorn volume {{ $labels.volume }} is running low on space"
+              description: "Volume {{ $labels.volume }} is at {{ printf \"%.1f\" $value }}% capacity. Consider expanding or cleaning up."
+
+          # Volume degraded. longhorn_volume_robustness is an enum gauge:
+          # 0=unknown, 1=healthy, 2=degraded, 3=faulted. Match 2 explicitly;
+          # `!= 1` would also fire for unknown (0) and faulted (3) volumes.
+          - alert: LonghornVolumeDegraded
+            expr: longhorn_volume_robustness == 2
+            for: 5m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Longhorn volume {{ $labels.volume }} is degraded"
+              description: "Volume {{ $labels.volume }} robustness is not healthy. Check replica status."
+
+          # Volume faulted (robustness 3). Reported separately from the
+          # degraded state, at critical severity.
+          - alert: LonghornVolumeFaulted
+            expr: longhorn_volume_robustness == 3
+            for: 5m
+            labels:
+              severity: critical
+            annotations:
+              summary: "Longhorn volume {{ $labels.volume }} is faulted"
+              description: "Volume {{ $labels.volume }} is faulted. Immediate action required."
+
+          # Node storage pressure: node storage usage above 90% for 10m
+          - alert: LonghornNodeStoragePressure
+            expr: |
+              (
+                longhorn_node_storage_usage_bytes
+                /
+                longhorn_node_storage_capacity_bytes
+              ) * 100 > 90
+            for: 10m
+            labels:
+              severity: warning
+            annotations:
+              summary: "Longhorn node {{ $labels.node }} storage pressure"
+              description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."