# File: homelab-manifests/mon-system/monitoring.yaml (967 lines, 26 KiB, YAML)

# =============================================================================
# MONITORING STACK - Prometheus + Grafana
# Namespace: mon-system
# =============================================================================
---
# Namespace used by the namespaced resources declared in this file.
apiVersion: v1
kind: Namespace
metadata:
  name: mon-system
  labels:
    name: mon-system
# =============================================================================
# PROMETHEUS CONFIGURATION
# =============================================================================
---
# =============================================================================
# Prometheus server configuration: global intervals, Alertmanager target,
# rule file glob and all scrape jobs. Mounted by the prometheus Deployment.
# =============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: mon-system
  labels:
    app: prometheus
data:
  # Literal block: the content below is handed to Prometheus verbatim.
  # Relabel rules are order-sensitive lists; do not reorder them.
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      scrape_timeout: 10s
    # Alertmanager configuration - THIS WAS MISSING!
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093
    # Load alerting rules
    rule_files:
      - /etc/prometheus/rules/*.yml
    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
      # Node Exporter - Host metrics
      - job_name: 'node-exporter'
        static_configs:
          - targets: ['node-exporter:9100']
        relabel_configs:
          - source_labels: [__address__]
            target_label: instance
            replacement: 'dooplex'
      # Kube-state-metrics - Kubernetes object metrics
      - job_name: 'kube-state-metrics'
        static_configs:
          - targets: ['kube-state-metrics:8080']
      # Nginx Ingress Controller
      - job_name: 'nginx-ingress'
        static_configs:
          - targets: ['ingress-nginx-internal-controller-metrics.nginx-system:10254']
      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
        metric_relabel_configs:
          # Drop high-cardinality histogram buckets
          - source_labels: [__name__]
            action: drop
            regex: '(apiserver|etcd|workqueue)_.+_bucket'
      # Kubernetes nodes (kubelet metrics)
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
        metric_relabel_configs:
          # Drop histogram buckets from kubelet too
          - source_labels: [__name__]
            action: drop
            regex: '.+_bucket'
      # Kubernetes nodes cadvisor (container metrics)
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        metrics_path: /metrics/cadvisor
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
      # Pi-hole
      - job_name: 'pihole'
        scrape_interval: 60s
        static_configs:
          - targets: ['pihole-exporter.pihole-system:9617']
      # Kubernetes services - EXCLUDE services that have dedicated jobs
      - job_name: 'kubernetes-services'
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          # DROP services that already have dedicated scrape jobs
          - source_labels: [__meta_kubernetes_service_name]
            action: drop
            regex: (node-exporter|kube-state-metrics|prometheus|ingress-nginx-internal-controller-metrics)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: service
      # Servarr stack metrics - 60s interval (less critical)
      - job_name: 'sonarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-sonarr:9707']
      - job_name: 'radarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-radarr:9708']
      - job_name: 'prowlarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-prowlarr:9709']
      # Longhorn metrics - 60s interval
      - job_name: 'longhorn'
        scrape_interval: 60s
        static_configs:
          - targets: ['longhorn-backend.longhorn-system:9500']
      # cert-manager metrics - 60s interval
      - job_name: 'cert-manager'
        scrape_interval: 60s
        static_configs:
          - targets: ['cert-manager.cert-manager:9402']
      # Alertmanager metrics
      - job_name: 'alertmanager'
        static_configs:
          - targets: ['alertmanager:9093']
      # version-checker metrics
      - job_name: 'version-checker'
        scrape_interval: 60s
        static_configs:
          - targets: ['version-checker.version-checker-system:8080']
      # Authentik server metrics (HTTP latency, outposts connected, flow/policy cache)
      - job_name: 'authentik-server'
        static_configs:
          - targets: ['authentik-server-metrics.auth-system:9300']
            labels:
              namespace: 'auth-system'
      # Authentik worker metrics (task queue depth/throughput, DB query latency)
      - job_name: 'authentik-worker'
        static_configs:
          - targets: ['authentik-worker-metrics.auth-system:9300']
            labels:
              namespace: 'auth-system'
      # Authentik outposts - SD with ports 9300 on ak-outpost-* services
      - job_name: 'authentik-outposts'
        kubernetes_sd_configs:
          - role: service
            namespaces:
              names: ['auth-system']
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_name]
            action: keep
            regex: 'ak-outpost-.*-outpost'
          - source_labels: [__meta_kubernetes_service_port_number]
            action: keep
            regex: '9300'
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: service
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: outpost
            regex: 'ak-outpost-(.*)-outpost'
            replacement: '$1'
      # CloudNativePG - Postgres metrics per instance
      - job_name: 'cloudnativepg'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: ['database-system']
        relabel_configs:
          # Keep only CNPG instance pods
          - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
            action: keep
            regex: '.+'
          # Keep only the metrics port (9187)
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: '9187'
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
            action: replace
            target_label: cluster
---
# 30Gi Longhorn-backed claim; the prometheus Deployment mounts it at /prometheus.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: mon-system
  labels:
    app: prometheus
    # NOTE(review): presumably opts this volume out of Longhorn's default
    # recurring-job group — confirm against Longhorn's label semantics.
    recurring-job-group.longhorn.io/default: disabled
spec:
  storageClassName: longhorn
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 30Gi
---
# Identity the Prometheus pod runs as; bound to the "prometheus" ClusterRole.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: mon-system
---
# Read-only cluster access for Prometheus service discovery and scraping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  # Core-group objects, plus the node sub-resources nodes/proxy and nodes/metrics.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs:
      - get
      - list
      - watch
  # Ingress objects in both API groups listed below.
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs:
      - get
      - list
      - watch
  # Raw metrics endpoints that are not tied to an API resource.
  - nonResourceURLs:
      - /metrics
      - /metrics/cadvisor
    verbs:
      - get
---
# Grants the "prometheus" ClusterRole to the mon-system/prometheus ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: mon-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
---
# =============================================================================
# Prometheus server Deployment: one replica with config, rules and data volumes
# =============================================================================
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: mon-system
  labels:
    app: prometheus
spec:
  replicas: 1
  revisionHistoryLimit: 10
  # NOTE(review): Recreate is presumably chosen because the data PVC is
  # ReadWriteOnce and cannot be shared by an old and a new pod — confirm.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      # Pod-level security settings: non-root UID 65534 with matching fsGroup.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      containers:
        - name: prometheus
          image: prom/prometheus:v3.12.0
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --storage.tsdb.retention.time=60d
            - --storage.tsdb.retention.size=19GB
            - --web.enable-lifecycle
            # NOTE(review): the admin API is served on the same port that the
            # prometheus Ingress routes to — confirm this exposure is intended.
            - --web.enable-admin-api
          ports:
            - name: http
              containerPort: 9090
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: "2"
              memory: 6Gi
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
          volumeMounts:
            # The rules mount is nested inside the config mount path.
            - name: config
              mountPath: /etc/prometheus
            - name: rules
              mountPath: /etc/prometheus/rules
            - name: data
              mountPath: /prometheus
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        # NOTE(review): the prometheus-rules ConfigMap is not defined in the
        # manifests visible here — confirm it is created elsewhere.
        - name: rules
          configMap:
            name: prometheus-rules
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data
---
# In-cluster endpoint for the Prometheus web UI and API on port 9090.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: mon-system
  labels:
    app: prometheus
  annotations:
    # Annotation values must stay strings, hence the quotes.
    prometheus.io/scrape: 'true'
    prometheus.io/port: '9090'
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      port: 9090
      targetPort: 9090
---
# Routes prometheus.home to the prometheus Service through the nginx-internal class.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  namespace: mon-system
  annotations:
    external-dns.alpha.kubernetes.io/hostname: prometheus.home
spec:
  ingressClassName: nginx-internal
  rules:
    - host: prometheus.home
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
# =============================================================================
# GRAFANA CONFIGURATION
# =============================================================================
---
# Grafana datasource provisioning file; the grafana Deployment mounts it at
# /etc/grafana/provisioning/datasources.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: mon-system
  labels:
    app: grafana
data:
  # Registers the in-cluster Prometheus Service as the default datasource.
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: true
---
# Grafana dashboard provider definition (file provider reading
# /var/lib/grafana/dashboards).
# NOTE(review): the grafana Deployment visible in this file mounts only the
# "data" and "datasources" volumes, so this ConfigMap appears unused — confirm.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-config
  namespace: mon-system
  labels:
    app: grafana
data:
  dashboards.yaml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        editable: true
        options:
          path: /var/lib/grafana/dashboards
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-data
namespace: mon-system
labels:
app: grafana
spec:
accessModes:
- ReadWriteOnce
storageClassName: longhorn
resources:
requests:
storage: 4Gi
---
# Grafana Deployment: single replica, generic OAuth login (named "authentik"),
# anonymous Viewer access, SMTP enabled. Credentials come from Secrets.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
      annotations:
        # Regex matching plain MAJOR.MINOR.PATCH strings, for version-checker.
        match-regex.version-checker.io/grafana: '^[0-9]+\.[0-9]+\.[0-9]+$'
    spec:
      # Pod-level security settings: UID/GID 472 with matching fsGroup.
      securityContext:
        runAsUser: 472
        runAsGroup: 472
        fsGroup: 472
      containers:
        - name: grafana
          image: grafana/grafana:12.4.4
          ports:
            - name: http
              containerPort: 3000
          # Every env value is a string; boolean-looking values stay quoted.
          env:
            - name: GF_SECURITY_ALLOW_EMBEDDING
              value: 'true'
            # --- Local admin account (from Secret grafana-credentials) ---
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: admin-user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: admin-password
            - name: GF_SERVER_ROOT_URL
              value: 'https://grafana.dooplex.hu'
            # --- Generic OAuth (client id/secret from Secret grafana-oauth) ---
            - name: GF_AUTH_GENERIC_OAUTH_ENABLED
              value: 'true'
            - name: GF_AUTH_GENERIC_OAUTH_NAME
              value: 'authentik'
            - name: GF_AUTH_GENERIC_OAUTH_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: grafana-oauth
                  key: client-id
            - name: GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: grafana-oauth
                  key: client-secret
            - name: GF_AUTH_GENERIC_OAUTH_SCOPES
              value: 'openid profile email grafana-admin-role'
            - name: GF_AUTH_GENERIC_OAUTH_AUTH_URL
              value: 'https://authentik.dooplex.hu/application/o/authorize/'
            - name: GF_AUTH_GENERIC_OAUTH_TOKEN_URL
              value: 'https://authentik.dooplex.hu/application/o/token/'
            - name: GF_AUTH_GENERIC_OAUTH_API_URL
              value: 'https://authentik.dooplex.hu/application/o/userinfo/'
            - name: GF_AUTH_SIGNOUT_REDIRECT_URL
              value: 'https://authentik.dooplex.hu/application/o/grafana/end-session/'
            - name: GF_AUTH_OAUTH_AUTO_LOGIN
              value: 'true'
            - name: GF_AUTH_GENERIC_OAUTH_GROUPS_ATTRIBUTE_PATH
              value: 'groups'
            # Expression yields 'Admin' for members of "Server Admin" or
            # "Grafana Admins", otherwise 'Viewer'. Double-quoted because the
            # expression itself contains single quotes.
            - name: GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH
              value: "(contains(groups[*], 'Server Admin') || contains(groups[*], 'Grafana Admins')) && 'Admin' || 'Viewer'"
            # --- Anonymous access ---
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: 'true'
            - name: GF_AUTH_ANONYMOUS_ORG_ROLE
              value: 'Viewer'
            - name: GF_AUTH_ANONYMOUS_ORG_NAME
              value: 'Main Org.'
            - name: GF_AUTH_DISABLE_LOGIN_FORM
              value: 'true'
            # --- Outgoing mail (password from Secret grafana-smtp) ---
            - name: GF_SMTP_ENABLED
              value: 'true'
            - name: GF_SMTP_HOST
              value: 'smtp.resend.com:587'
            - name: GF_SMTP_USER
              value: 'resend'
            - name: GF_SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-smtp
                  key: resend-api-key
            - name: GF_SMTP_FROM_ADDRESS
              value: 'monitoring@jarrs.eu'
            - name: GF_SMTP_FROM_NAME
              value: 'JARR Monitoring'
            - name: GF_SMTP_STARTTLS_POLICY
              value: 'MandatoryStartTLS'
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 256Mi
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: data
              mountPath: /var/lib/grafana
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        - name: datasources
          configMap:
            name: grafana-datasources
---
# In-cluster endpoint for Grafana on port 3000; backend of the grafana Ingress.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
  annotations:
    external-dns.alpha.kubernetes.io/hostname: grafana.home
spec:
  type: ClusterIP
  selector:
    app: grafana
  ports:
    - name: http
      port: 3000
      targetPort: 3000
---
# Exposes Grafana on grafana.dooplex.hu (TLS via cert-manager) and grafana.home.
# The nginx snippet rejects requests unless the client address starts with
# 192.168. or 10., or the GeoIP2 country code is HU.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    external-dns.alpha.kubernetes.io/hostname: grafana.dooplex.hu,grafana.home
    nginx.ingress.kubernetes.io/ssl-redirect: 'true'
    # Literal block: passed to nginx as-is. $geo_allowed starts at 0 and is
    # switched to 1 by any matching condition; 0 at the end means 403.
    nginx.ingress.kubernetes.io/configuration-snippet: |
      set $geo_allowed 0;
      if ($remote_addr ~ "^192\.168\.") { set $geo_allowed 1; }
      if ($remote_addr ~ "^10\.") { set $geo_allowed 1; }
      if ($geoip2_country_code = "HU") { set $geo_allowed 1; }
      if ($geo_allowed = 0) {
        return 403 "Access restricted to Hungary";
      }
spec:
  ingressClassName: nginx-internal
  tls:
    - secretName: grafana-tls
      hosts:
        - grafana.dooplex.hu
  rules:
    - host: grafana.dooplex.hu
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: grafana
                port:
                  number: 3000
    - host: grafana.home
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: grafana
                port:
                  number: 3000
# =============================================================================
# NODE EXPORTER - Host metrics (CPU, RAM, Disk, Network)
# Runs on the host network to collect host metrics
# =============================================================================
---
# node-exporter DaemonSet: uses the host network and PID namespace, with the
# host's /proc, /sys and / mounted read-only under /host/*.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: mon-system
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '9100'
    spec:
      hostNetwork: true
      hostPID: true
      # Tolerate every NoSchedule taint, whatever its key.
      tolerations:
        - operator: Exists
          effect: NoSchedule
      containers:
        - name: node-exporter
          image: prom/node-exporter:v1.11.1
          args:
            # Point the collectors at the host paths mounted below.
            - '--path.procfs=/host/proc'
            - '--path.sysfs=/host/sys'
            - '--path.rootfs=/host/root'
            # NOTE(review): `$$` is presumably the Kubernetes escape for a
            # literal `$` in container args — confirm.
            - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
            - '--collector.textfile.directory=/host/textfile'
          ports:
            - name: metrics
              containerPort: 9100
              hostPort: 9100
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 64Mi
          securityContext:
            privileged: true
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
            - name: textfile
              mountPath: /host/textfile
              readOnly: true
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
        # The textfile directory is created on the host if it is missing.
        - name: textfile
          hostPath:
            path: /var/lib/node_exporter/textfile_collector
            type: DirectoryOrCreate
---
# Stable name (node-exporter:9100) used by the static 'node-exporter' scrape job.
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: mon-system
  labels:
    app: node-exporter
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '9100'
spec:
  type: ClusterIP
  selector:
    app: node-exporter
  ports:
    - name: metrics
      port: 9100
      targetPort: 9100
# =============================================================================
# EXPORTARR - Metrics for Sonarr, Radarr, Prowlarr
# =============================================================================
---
# Exportarr instance for Sonarr: queries http://sonarr.servarr-system:8989 and
# serves metrics on port 9707.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-sonarr
  namespace: mon-system
  labels:
    app: exportarr-sonarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-sonarr
  template:
    metadata:
      labels:
        app: exportarr-sonarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["sonarr"]
          env:
            - name: PORT
              value: "9707"
            - name: URL
              value: "http://sonarr.servarr-system:8989"
            # The API key is read from a Secret rather than committed in plain
            # text. Create the Secret before rolling this out, for example:
            #   kubectl -n mon-system create secret generic exportarr-sonarr \
            #     --from-literal=api-key=<sonarr-api-key>
            # The key that was previously hard-coded here is in version
            # control history and should be regenerated in Sonarr.
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-sonarr
                  key: api-key
          ports:
            - containerPort: 9707
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
# Stable name (exportarr-sonarr:9707) used by the 'sonarr' scrape job.
apiVersion: v1
kind: Service
metadata:
  name: exportarr-sonarr
  namespace: mon-system
spec:
  selector:
    app: exportarr-sonarr
  ports:
    - port: 9707
      targetPort: 9707
---
# Exportarr instance for Radarr: queries http://radarr.servarr-system:7878 and
# serves metrics on port 9708.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-radarr
  namespace: mon-system
  labels:
    app: exportarr-radarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-radarr
  template:
    metadata:
      labels:
        app: exportarr-radarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["radarr"]
          env:
            - name: PORT
              value: "9708"
            - name: URL
              value: "http://radarr.servarr-system:7878"
            # The API key is read from a Secret rather than committed in plain
            # text. Create the Secret before rolling this out, for example:
            #   kubectl -n mon-system create secret generic exportarr-radarr \
            #     --from-literal=api-key=<radarr-api-key>
            # The key that was previously hard-coded here is in version
            # control history and should be regenerated in Radarr.
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-radarr
                  key: api-key
          ports:
            - containerPort: 9708
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
# Stable name (exportarr-radarr:9708) used by the 'radarr' scrape job.
apiVersion: v1
kind: Service
metadata:
  name: exportarr-radarr
  namespace: mon-system
spec:
  selector:
    app: exportarr-radarr
  ports:
    - port: 9708
      targetPort: 9708
---
# Exportarr instance for Prowlarr: queries http://prowlarr.servarr-system:9696
# and serves metrics on port 9709.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-prowlarr
  namespace: mon-system
  labels:
    app: exportarr-prowlarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-prowlarr
  template:
    metadata:
      labels:
        app: exportarr-prowlarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["prowlarr"]
          env:
            - name: PORT
              value: "9709"
            - name: URL
              value: "http://prowlarr.servarr-system:9696"
            # The API key is read from a Secret rather than committed in plain
            # text. Create the Secret before rolling this out, for example:
            #   kubectl -n mon-system create secret generic exportarr-prowlarr \
            #     --from-literal=api-key=<prowlarr-api-key>
            # The key that was previously hard-coded here is in version
            # control history and should be regenerated in Prowlarr.
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-prowlarr
                  key: api-key
          ports:
            - containerPort: 9709
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
# Stable name (exportarr-prowlarr:9709) used by the 'prowlarr' scrape job.
apiVersion: v1
kind: Service
metadata:
  name: exportarr-prowlarr
  namespace: mon-system
spec:
  selector:
    app: exportarr-prowlarr
  ports:
    - port: 9709
      targetPort: 9709