---
# =============================================================================
# Prometheus Alerting Rules for Longhorn
# Excludes prometheus-data PVC since it's designed to run at ~95% capacity
# =============================================================================
#
# This ConfigMap carries a single Prometheus rule file (longhorn-alerts.yml).
# Everything under the `|` block scalar below is passed through verbatim as
# the rule file's text, so the comments inside it end up in the rule file too.

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: mon-system
  labels:
    app: prometheus
data:
  longhorn-alerts.yml: |
    groups:
      - name: longhorn-volume-alerts
        rules:
          # Critical: Volume at 95% capacity (excluding prometheus-data).
          # The excluded volume name is the prometheus-data PVC mentioned in
          # the file header; it must be updated if that PVC is ever recreated.
          - alert: LonghornVolumeSpaceCritical
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 95
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is critically full"
              description: 'Volume {{ $labels.volume }} is at {{ printf "%.1f" $value }}% capacity. Immediate action required.'

          # Warning: Volume at 85% capacity (excluding prometheus-data).
          # Same ratio as the critical rule, with a lower threshold and a
          # longer hold time before firing.
          - alert: LonghornVolumeSpaceWarning
            expr: |
              (
                (avg by (volume) (longhorn_volume_actual_size_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
                /
                (avg by (volume) (longhorn_volume_capacity_bytes{volume!="pvc-81912547-5a18-4410-ac27-9c15251483a8"}))
              ) * 100 > 85
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is running low on space"
              description: 'Volume {{ $labels.volume }} is at {{ printf "%.1f" $value }}% capacity. Consider expanding or cleaning up.'

          # Volume degraded.
          # Per the Longhorn metrics reference, robustness is 0=unknown,
          # 1=healthy, 2=degraded, 3=faulted. Matching "> 1" covers degraded
          # and faulted volumes while ignoring 0 (unknown), which is what
          # detached volumes report; "!= 1" would alert on every detached
          # volume.
          - alert: LonghornVolumeDegraded
            expr: longhorn_volume_robustness > 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn volume {{ $labels.volume }} is degraded"
              description: "Volume {{ $labels.volume }} robustness is not healthy. Check replica status."

          # Node storage pressure: per-node used/capacity ratio above 90%.
          - alert: LonghornNodeStoragePressure
            expr: |
              (
                longhorn_node_storage_usage_bytes
                /
                longhorn_node_storage_capacity_bytes
              ) * 100 > 90
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "Longhorn node {{ $labels.node }} storage pressure"
              description: 'Node {{ $labels.node }} disk usage is at {{ printf "%.1f" $value }}%.'