# File: homelab-manifests/mon-system/monitoring.yaml
# Last modified: 2026-01-07 15:59:09 +01:00
# Size: 854 lines, 22 KiB (YAML)

# =============================================================================
# MONITORING STACK - Prometheus + Grafana
# Namespace: mon-system
# =============================================================================
---
# Namespace that holds every namespaced resource declared in this file.
kind: Namespace
apiVersion: v1
metadata:
  name: mon-system
  labels:
    name: mon-system
# =============================================================================
# PROMETHEUS CONFIGURATION
# =============================================================================
---
# =============================================================================
# Updated Prometheus ConfigMap with Alerting Configuration
# =============================================================================
# Prometheus server configuration. The prometheus Deployment mounts this
# ConfigMap at /etc/prometheus and loads prometheus.yml via --config.file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: mon-system
  labels:
    app: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      scrape_timeout: 10s

    # Alertmanager instance that firing alerts are delivered to.
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    # Alerting rules (mounted from the prometheus-rules ConfigMap).
    rule_files:
      - /etc/prometheus/rules/*.yml

    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Node Exporter - Host metrics
      - job_name: 'node-exporter'
        static_configs:
          - targets: ['node-exporter:9100']
        relabel_configs:
          # Overwrite the instance label with the fixed name 'dooplex'.
          - source_labels: [__address__]
            target_label: instance
            replacement: 'dooplex'

      # Kube-state-metrics - Kubernetes object metrics
      - job_name: 'kube-state-metrics'
        static_configs:
          - targets: ['kube-state-metrics:8080']

      # Nginx Ingress Controller
      - job_name: 'nginx-ingress'
        static_configs:
          - targets: ['ingress-nginx-internal-controller-metrics.nginx-system:10254']

      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        # The API server certificate is verified against the service-account
        # CA bundle. insecure_skip_verify is deliberately NOT set here: with
        # it enabled the ca_file is ignored and the bearer token below could
        # be sent to an unverified endpoint.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          # Keep only the default/kubernetes service's https endpoint.
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
        metric_relabel_configs:
          # Drop high-cardinality histogram buckets
          - source_labels: [__name__]
            action: drop
            regex: '(apiserver|etcd|workqueue)_.+_bucket'

      # Kubernetes nodes (kubelet metrics)
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # NOTE(review): verification is skipped for the kubelet endpoint,
          # presumably because kubelet serving certs are not signed by the
          # cluster CA here - confirm, and remove if they are.
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          # Copy every Kubernetes node label onto the scraped series.
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
        metric_relabel_configs:
          # Drop histogram buckets from kubelet too
          - source_labels: [__name__]
            action: drop
            regex: '.+_bucket'

      # Kubernetes nodes cadvisor (container metrics)
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # NOTE(review): same kubelet endpoint as the job above - confirm
          # whether skipping verification is still required.
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        metrics_path: /metrics/cadvisor
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

      # Pi-hole
      - job_name: 'pihole'
        scrape_interval: 60s
        static_configs:
          - targets: ['pihole-exporter.pihole-system:9617']

      # Kubernetes services - EXCLUDE services that have dedicated jobs
      - job_name: 'kubernetes-services'
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          # Only services annotated prometheus.io/scrape: "true".
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          # DROP services that already have dedicated scrape jobs
          - source_labels: [__meta_kubernetes_service_name]
            action: drop
            regex: (node-exporter|kube-state-metrics|prometheus|ingress-nginx-internal-controller-metrics)
          # Honour an optional prometheus.io/path annotation.
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Honour an optional prometheus.io/port annotation.
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: service

      # Servarr stack metrics - 60s interval (less critical)
      - job_name: 'sonarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-sonarr:9707']
      - job_name: 'radarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-radarr:9708']
      - job_name: 'prowlarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-prowlarr:9709']

      # Longhorn metrics - 60s interval
      - job_name: 'longhorn'
        scrape_interval: 60s
        static_configs:
          - targets: ['longhorn-backend.longhorn-system:9500']

      # cert-manager metrics - 60s interval
      - job_name: 'cert-manager'
        scrape_interval: 60s
        static_configs:
          - targets: ['cert-manager.cert-manager:9402']

      # Alertmanager metrics
      - job_name: 'alertmanager'
        static_configs:
          - targets: ['alertmanager:9093']
---
# Longhorn-backed volume claimed by the prometheus Deployment as its "data"
# volume (mounted at /prometheus).
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: prometheus-data
  namespace: mon-system
  labels:
    app: prometheus
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 20Gi
---
# Identity the prometheus pod runs as (serviceAccountName: prometheus); it is
# the subject of the prometheus ClusterRoleBinding.
kind: ServiceAccount
apiVersion: v1
metadata:
  name: prometheus
  namespace: mon-system
---
# Read-only, cluster-wide permissions for the prometheus ClusterRoleBinding.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: prometheus
rules:
  # Core API objects (read-only).
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs:
      - "get"
      - "list"
      - "watch"
  # Ingress objects (read-only).
  - apiGroups:
      - "extensions"
      - "networking.k8s.io"
    resources:
      - ingresses
    verbs:
      - "get"
      - "list"
      - "watch"
  # Raw metrics endpoints that are not API resources.
  - nonResourceURLs:
      - "/metrics"
      - "/metrics/cadvisor"
    verbs:
      - "get"
---
# Grants the prometheus ClusterRole to the prometheus ServiceAccount in
# mon-system.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: mon-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
---
# =============================================================================
# Updated Prometheus Deployment with Rules Volume
# =============================================================================
# Single-replica Prometheus server. Uses the Recreate strategy so the old pod
# releases the ReadWriteOnce data volume before the new one starts.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: mon-system
  labels:
    app: prometheus
spec:
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: prometheus
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # Run as uid 65534 and give that group ownership of mounted volumes.
      securityContext:
        fsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      containers:
        - name: prometheus
          image: prom/prometheus:v3.8.0
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --storage.tsdb.retention.time=60d
            # Size-based retention is capped at ~80% of the 20Gi
            # prometheus-data PVC (was 19GB) so that old blocks are pruned
            # before the volume fills up; the Prometheus storage docs
            # recommend at most 80-85% of the allocated disk.
            - --storage.tsdb.retention.size=16GB
            # NOTE(review): the two flags below enable HTTP endpoints that
            # can reload/shut down the server and delete TSDB data. The
            # prometheus Ingress in this file exposes port 9090 without any
            # auth annotation - confirm this exposure is intended.
            - --web.enable-lifecycle
            - --web.enable-admin-api
          ports:
            - containerPort: 9090
              name: http
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: '2'
              memory: 6Gi
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            # Nested inside the config mount; matches rule_files in
            # prometheus.yml (/etc/prometheus/rules/*.yml).
            - name: rules
              mountPath: /etc/prometheus/rules
            - name: data
              mountPath: /prometheus
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: rules
          configMap:
            name: prometheus-rules
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data
---
# ClusterIP service in front of the prometheus pods on port 9090.
kind: Service
apiVersion: v1
metadata:
  name: prometheus
  namespace: mon-system
  labels:
    app: prometheus
  annotations:
    prometheus.io/port: "9090"
    prometheus.io/scrape: "true"
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      port: 9090
      targetPort: 9090
---
# Routes prometheus.home (ingress class nginx-internal) to the prometheus
# Service on port 9090. No TLS section is defined for this host.
kind: Ingress
apiVersion: networking.k8s.io/v1
metadata:
  name: prometheus
  namespace: mon-system
  annotations:
    external-dns.alpha.kubernetes.io/hostname: prometheus.home
spec:
  ingressClassName: nginx-internal
  rules:
    - host: prometheus.home
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
# =============================================================================
# GRAFANA CONFIGURATION
# =============================================================================
---
# Grafana datasource provisioning file. The grafana Deployment mounts it at
# /etc/grafana/provisioning/datasources.
kind: ConfigMap
apiVersion: v1
metadata:
  name: grafana-datasources
  namespace: mon-system
  labels:
    app: grafana
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      # Default datasource: the in-cluster prometheus Service.
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: true
---
# Grafana dashboard provider definition: one file-based provider named
# 'default' that reads dashboards from /var/lib/grafana/dashboards.
# NOTE(review): the grafana Deployment visible in this file mounts only the
# data PVC and the grafana-datasources ConfigMap - confirm where (or whether)
# this ConfigMap is mounted.
kind: ConfigMap
apiVersion: v1
metadata:
  name: grafana-dashboards-config
  namespace: mon-system
  labels:
    app: grafana
data:
  dashboards.yaml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        editable: true
        options:
          path: /var/lib/grafana/dashboards
---
# Longhorn-backed volume claimed by the grafana Deployment as its "data"
# volume (mounted at /var/lib/grafana).
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: grafana-data
  namespace: mon-system
  labels:
    app: grafana
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
# Single-replica Grafana with generic-OAuth login. Admin credentials and the
# OAuth client id/secret are read from the grafana-credentials and
# grafana-oauth Secrets, which are not defined in this file.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
spec:
  replicas: 1
  # Recreate: the data PVC is ReadWriteOnce, so the old pod must stop first.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        fsGroup: 472
        runAsUser: 472
        runAsGroup: 472
      containers:
        - name: grafana
          image: grafana/grafana:12.3.0
          ports:
            - containerPort: 3000
              name: http
          env:
            # --- Local admin account ---
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: admin-user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: admin-password
            - name: GF_SERVER_ROOT_URL
              value: "https://grafana.dooplex.hu"
            # --- Generic OAuth provider (displayed as "authentik") ---
            - name: GF_AUTH_GENERIC_OAUTH_ENABLED
              value: "true"
            - name: GF_AUTH_GENERIC_OAUTH_NAME
              value: "authentik"
            - name: GF_AUTH_GENERIC_OAUTH_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: grafana-oauth
                  key: client-id
            - name: GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: grafana-oauth
                  key: client-secret
            - name: GF_AUTH_GENERIC_OAUTH_SCOPES
              value: "openid profile email grafana-admin-role"
            - name: GF_AUTH_GENERIC_OAUTH_AUTH_URL
              value: "https://authentik.dooplex.hu/application/o/authorize/"
            - name: GF_AUTH_GENERIC_OAUTH_TOKEN_URL
              value: "https://authentik.dooplex.hu/application/o/token/"
            - name: GF_AUTH_GENERIC_OAUTH_API_URL
              value: "https://authentik.dooplex.hu/application/o/userinfo/"
            - name: GF_AUTH_SIGNOUT_REDIRECT_URL
              value: "https://authentik.dooplex.hu/application/o/grafana/end-session/"
            # Provider-scoped auto login ([auth.generic_oauth] auto_login).
            # Replaces the deprecated global GF_AUTH_OAUTH_AUTO_LOGIN
            # ([auth] oauth_auto_login) setting.
            - name: GF_AUTH_GENERIC_OAUTH_AUTO_LOGIN
              value: "true"
            - name: GF_AUTH_GENERIC_OAUTH_GROUPS_ATTRIBUTE_PATH
              value: "groups"
            # Members of 'Server Admin' or 'Grafana Admins' become Admin;
            # everyone else is a Viewer.
            - name: GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH
              value: "(contains(groups[*], 'Server Admin') || contains(groups[*], 'Grafana Admins')) && 'Admin' || 'Viewer'"
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 256Mi
          volumeMounts:
            - name: data
              mountPath: /var/lib/grafana
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        - name: datasources
          configMap:
            name: grafana-datasources
---
# ClusterIP service in front of the grafana pods on port 3000.
kind: Service
apiVersion: v1
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
  annotations:
    external-dns.alpha.kubernetes.io/hostname: grafana.home
spec:
  type: ClusterIP
  selector:
    app: grafana
  ports:
    - name: http
      port: 3000
      targetPort: 3000
---
# Routes grafana.dooplex.hu and grafana.home (ingress class nginx-internal)
# to the grafana Service on port 3000. Only grafana.dooplex.hu is listed in
# the TLS section (secret grafana-tls, issuer letsencrypt-prod).
kind: Ingress
apiVersion: networking.k8s.io/v1
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    external-dns.alpha.kubernetes.io/hostname: grafana.dooplex.hu,grafana.home
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
spec:
  ingressClassName: nginx-internal
  tls:
    - secretName: grafana-tls
      hosts:
        - grafana.dooplex.hu
  rules:
    - host: grafana.dooplex.hu
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: grafana
                port:
                  number: 3000
    - host: grafana.home
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: grafana
                port:
                  number: 3000
# =============================================================================
# NODE EXPORTER - Host metrics (CPU, RAM, Disk, Network)
# Runs on the host network to collect host metrics
# =============================================================================
---
# node-exporter on the host network and PID namespace, with the host's /proc,
# /sys and / mounted read-only under /host.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  name: node-exporter
  namespace: mon-system
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
      annotations:
        prometheus.io/port: "9100"
        prometheus.io/scrape: "true"
    spec:
      hostNetwork: true
      hostPID: true
      # Tolerate any NoSchedule taint.
      tolerations:
        - operator: Exists
          effect: NoSchedule
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
        # Textfile-collector drop directory; created on the host if absent.
        - name: textfile
          hostPath:
            path: /var/lib/node_exporter/textfile_collector
            type: DirectoryOrCreate
      containers:
        - name: node-exporter
          image: prom/node-exporter:v1.10.2
          args:
            - '--path.procfs=/host/proc'
            - '--path.sysfs=/host/sys'
            - '--path.rootfs=/host/root'
            - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
            - '--collector.textfile.directory=/host/textfile'
          ports:
            - name: metrics
              containerPort: 9100
              hostPort: 9100
          securityContext:
            privileged: true
          resources:
            limits:
              cpu: 200m
              memory: 64Mi
            requests:
              cpu: 50m
              memory: 32Mi
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
            - name: textfile
              mountPath: /host/textfile
              readOnly: true
---
# ClusterIP service selecting the node-exporter pods on port 9100; this is
# the 'node-exporter:9100' target used by the Prometheus scrape config.
kind: Service
apiVersion: v1
metadata:
  name: node-exporter
  namespace: mon-system
  labels:
    app: node-exporter
  annotations:
    prometheus.io/port: "9100"
    prometheus.io/scrape: "true"
spec:
  type: ClusterIP
  selector:
    app: node-exporter
  ports:
    - name: metrics
      port: 9100
      targetPort: 9100
# =============================================================================
# EXPORTARR - Metrics for Sonarr, Radarr, Prowlarr
# =============================================================================
---
# exportarr in "sonarr" mode: serves metrics on port 9707 for the Sonarr
# instance at sonarr.servarr-system:8989.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-sonarr
  namespace: mon-system
  labels:
    app: exportarr-sonarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-sonarr
  template:
    metadata:
      labels:
        app: exportarr-sonarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["sonarr"]
          env:
            - name: PORT
              value: "9707"
            - name: URL
              value: "http://sonarr.servarr-system:8989"
            # The API key is read from a Secret rather than being committed
            # in plain text (same pattern as the Grafana credentials).
            # The Secret "exportarr-api-keys" with key "sonarr" must exist in
            # mon-system before this Deployment is applied. The key that was
            # previously committed in this manifest should be rotated.
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-api-keys
                  key: sonarr
          ports:
            - containerPort: 9707
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
# Service for the exportarr-sonarr pods on port 9707 (the target of the
# 'sonarr' scrape job).
kind: Service
apiVersion: v1
metadata:
  name: exportarr-sonarr
  namespace: mon-system
spec:
  selector:
    app: exportarr-sonarr
  ports:
    - port: 9707
      targetPort: 9707
---
# exportarr in "radarr" mode: serves metrics on port 9708 for the Radarr
# instance at radarr.servarr-system:7878.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-radarr
  namespace: mon-system
  labels:
    app: exportarr-radarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-radarr
  template:
    metadata:
      labels:
        app: exportarr-radarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["radarr"]
          env:
            - name: PORT
              value: "9708"
            - name: URL
              value: "http://radarr.servarr-system:7878"
            # The API key is read from a Secret rather than being committed
            # in plain text (same pattern as the Grafana credentials).
            # The Secret "exportarr-api-keys" with key "radarr" must exist in
            # mon-system before this Deployment is applied. The key that was
            # previously committed in this manifest should be rotated.
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-api-keys
                  key: radarr
          ports:
            - containerPort: 9708
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
# Service for the exportarr-radarr pods on port 9708 (the target of the
# 'radarr' scrape job).
kind: Service
apiVersion: v1
metadata:
  name: exportarr-radarr
  namespace: mon-system
spec:
  selector:
    app: exportarr-radarr
  ports:
    - port: 9708
      targetPort: 9708
---
# exportarr in "prowlarr" mode: serves metrics on port 9709 for the Prowlarr
# instance at prowlarr.servarr-system:9696.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-prowlarr
  namespace: mon-system
  labels:
    app: exportarr-prowlarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-prowlarr
  template:
    metadata:
      labels:
        app: exportarr-prowlarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["prowlarr"]
          env:
            - name: PORT
              value: "9709"
            - name: URL
              value: "http://prowlarr.servarr-system:9696"
            # The API key is read from a Secret rather than being committed
            # in plain text (same pattern as the Grafana credentials).
            # The Secret "exportarr-api-keys" with key "prowlarr" must exist
            # in mon-system before this Deployment is applied. The key that
            # was previously committed in this manifest should be rotated.
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-api-keys
                  key: prowlarr
          ports:
            - containerPort: 9709
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
# Service for the exportarr-prowlarr pods on port 9709 (the target of the
# 'prowlarr' scrape job).
kind: Service
apiVersion: v1
metadata:
  name: exportarr-prowlarr
  namespace: mon-system
spec:
  selector:
    app: exportarr-prowlarr
  ports:
    - port: 9709
      targetPort: 9709