# =============================================================================
# MONITORING STACK - Prometheus + Grafana
# Namespace: mon-system
#
# Secrets referenced by this file but NOT defined here (must exist in
# mon-system before applying):
#   grafana-credentials, grafana-oauth, grafana-smtp, exportarr-api-keys
# ConfigMaps referenced but NOT defined here:
#   prometheus-rules
# =============================================================================
---
apiVersion: v1
kind: Namespace
metadata:
  name: mon-system
  labels:
    name: mon-system

# =============================================================================
# PROMETHEUS CONFIGURATION
# =============================================================================
---
# =============================================================================
# Updated Prometheus ConfigMap with Alerting Configuration
#
# NOTE(review): the kubernetes-apiservers, kubernetes-nodes and
# kubernetes-cadvisor jobs set both `ca_file` and `insecure_skip_verify: true`;
# with verification skipped the CA file is not enforced. Confirm whether
# skipping verification is still required for each job.
# =============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: mon-system
  labels:
    app: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: 30s
      evaluation_interval: 30s
      scrape_timeout: 10s

    # Alertmanager configuration - THIS WAS MISSING!
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    # Load alerting rules
    rule_files:
      - /etc/prometheus/rules/*.yml

    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']

      # Node Exporter - Host metrics
      - job_name: 'node-exporter'
        static_configs:
          - targets: ['node-exporter:9100']
        relabel_configs:
          - source_labels: [__address__]
            target_label: instance
            replacement: 'dooplex'

      # Kube-state-metrics - Kubernetes object metrics
      - job_name: 'kube-state-metrics'
        static_configs:
          - targets: ['kube-state-metrics:8080']

      # Nginx Ingress Controller
      - job_name: 'nginx-ingress'
        static_configs:
          - targets: ['ingress-nginx-internal-controller-metrics.nginx-system:10254']

      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
        metric_relabel_configs:
          # Drop high-cardinality histogram buckets
          - source_labels: [__name__]
            action: drop
            regex: '(apiserver|etcd|workqueue)_.+_bucket'

      # Kubernetes nodes (kubelet metrics)
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
        metric_relabel_configs:
          # Drop histogram buckets from kubelet too
          - source_labels: [__name__]
            action: drop
            regex: '.+_bucket'

      # Kubernetes nodes cadvisor (container metrics)
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        metrics_path: /metrics/cadvisor
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

      # Pi-hole
      - job_name: 'pihole'
        scrape_interval: 60s
        static_configs:
          - targets: ['pihole-exporter.pihole-system:9617']

      # Kubernetes services - EXCLUDE services that have dedicated jobs
      - job_name: 'kubernetes-services'
        kubernetes_sd_configs:
          - role: service
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: true
          # DROP services that already have dedicated scrape jobs
          - source_labels: [__meta_kubernetes_service_name]
            action: drop
            regex: (node-exporter|kube-state-metrics|prometheus|ingress-nginx-internal-controller-metrics)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: service

      # Servarr stack metrics - 60s interval (less critical)
      - job_name: 'sonarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-sonarr:9707']

      - job_name: 'radarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-radarr:9708']

      - job_name: 'prowlarr'
        scrape_interval: 60s
        static_configs:
          - targets: ['exportarr-prowlarr:9709']

      # Longhorn metrics - 60s interval
      - job_name: 'longhorn'
        scrape_interval: 60s
        static_configs:
          - targets: ['longhorn-backend.longhorn-system:9500']

      # cert-manager metrics - 60s interval
      - job_name: 'cert-manager'
        scrape_interval: 60s
        static_configs:
          - targets: ['cert-manager.cert-manager:9402']

      # Alertmanager metrics
      - job_name: 'alertmanager'
        static_configs:
          - targets: ['alertmanager:9093']

      # version-checker metrics
      - job_name: 'version-checker'
        scrape_interval: 60s
        static_configs:
          - targets: ['version-checker.version-checker-system:8080']

      # Authentik server metrics (HTTP latency, outposts connected, flow/policy cache)
      - job_name: 'authentik-server'
        static_configs:
          - targets: ['authentik-server-metrics.auth-system:9300']
            labels:
              namespace: 'auth-system'

      # Authentik worker metrics (task queue depth/throughput, DB query latency)
      - job_name: 'authentik-worker'
        static_configs:
          - targets: ['authentik-worker-metrics.auth-system:9300']
            labels:
              namespace: 'auth-system'

      # Authentik outposts - SD with ports 9300 on ak-outpost-* services
      - job_name: 'authentik-outposts'
        kubernetes_sd_configs:
          - role: service
            namespaces:
              names: ['auth-system']
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_name]
            action: keep
            regex: 'ak-outpost-.*-outpost'
          - source_labels: [__meta_kubernetes_service_port_number]
            action: keep
            regex: '9300'
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: service
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: outpost
            regex: 'ak-outpost-(.*)-outpost'
            replacement: '$1'

      # CloudNativePG - Postgres metrics per instance
      - job_name: 'cloudnativepg'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: ['database-system']
        relabel_configs:
          # Keep only CNPG instance pods
          - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
            action: keep
            regex: '.+'
          # Keep only the metrics port (9187)
          - source_labels: [__meta_kubernetes_pod_container_port_number]
            action: keep
            regex: '9187'
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_label_cnpg_io_cluster]
            action: replace
            target_label: cluster
---
# Persistent storage for the Prometheus TSDB (mounted at /prometheus).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data
  namespace: mon-system
  labels:
    app: prometheus
    recurring-job-group.longhorn.io/default: disabled
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 30Gi
---
# Identity used by the Prometheus pod for Kubernetes service discovery.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: mon-system
---
# Read-only, cluster-wide access to the objects and metric endpoints that the
# kubernetes_sd_configs jobs in prometheus.yml discover and scrape.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
    verbs: ["get"]
---
# Grants the ClusterRole above to the mon-system/prometheus ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: mon-system
---
# =============================================================================
# Updated Prometheus Deployment with Rules Volume
#
# NOTE(review): --web.enable-admin-api and --web.enable-lifecycle are enabled
# while the Ingress further down publishes this server at prometheus.home
# with no authentication annotations visible in this file. Confirm that
# exposing the admin (delete/snapshot) and reload/quit endpoints is intended.
# =============================================================================
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: mon-system
spec:
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: prometheus
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
        - name: prometheus
          image: prom/prometheus:v3.9.1
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --storage.tsdb.retention.time=60d
            - --storage.tsdb.retention.size=19GB
            - --web.enable-lifecycle
            - --web.enable-admin-api
          ports:
            - containerPort: 9090
              name: http
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: "2"
              memory: 6Gi
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            # Mounted inside the config directory so that the rule_files glob
            # /etc/prometheus/rules/*.yml picks the rules up.
            - name: rules
              mountPath: /etc/prometheus/rules
            - name: data
              mountPath: /prometheus
      securityContext:
        fsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: prometheus
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: rules
          configMap:
            name: prometheus-rules
        - name: data
          persistentVolumeClaim:
            claimName: prometheus-data
---
# In-cluster endpoint for Prometheus (used by the Grafana datasource below).
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: mon-system
  labels:
    app: prometheus
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "9090"
spec:
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
      name: http
  selector:
    app: prometheus
---
# Publishes the Prometheus UI/API at prometheus.home via nginx-internal.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus
  namespace: mon-system
  annotations:
    external-dns.alpha.kubernetes.io/hostname: prometheus.home
spec:
  ingressClassName: nginx-internal
  rules:
    - host: prometheus.home
      http:
        paths:
          - backend:
              service:
                name: prometheus
                port:
                  number: 9090
            path: /
            pathType: Prefix

# =============================================================================
# GRAFANA CONFIGURATION
# =============================================================================
---
# Datasource provisioning file; mounted by the Grafana Deployment at
# /etc/grafana/provisioning/datasources.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: mon-system
  labels:
    app: grafana
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Prometheus
        type: prometheus
        access: proxy
        url: http://prometheus:9090
        isDefault: true
        editable: true
---
# Dashboard provider definition.
# NOTE(review): the Grafana Deployment in this file does not mount this
# ConfigMap (its volumes are only `data` and `datasources`), so this provider
# is not loaded unless something outside this file mounts it. Confirm.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards-config
  namespace: mon-system
  labels:
    app: grafana
data:
  dashboards.yaml: |
    apiVersion: 1
    providers:
      - name: 'default'
        orgId: 1
        folder: ''
        type: file
        disableDeletion: false
        editable: true
        options:
          path: /var/lib/grafana/dashboards
---
# Persistent storage for Grafana (mounted at /var/lib/grafana).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-data
  namespace: mon-system
  labels:
    app: grafana
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: longhorn
  resources:
    requests:
      storage: 4Gi
---
# Grafana server. Admin credentials, OAuth client and SMTP password are read
# from Secrets; everything else is configured through GF_* environment
# variables below.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
      annotations:
        match-regex.version-checker.io/grafana: '^[0-9]+\.[0-9]+\.[0-9]+$'
    spec:
      securityContext:
        fsGroup: 472
        runAsUser: 472
        runAsGroup: 472
      containers:
        - name: grafana
          image: grafana/grafana:12.3.2
          ports:
            - containerPort: 3000
              name: http
          env:
            - name: GF_SECURITY_ALLOW_EMBEDDING
              value: "true"
            - name: GF_SECURITY_ADMIN_USER
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: admin-user
            - name: GF_SECURITY_ADMIN_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-credentials
                  key: admin-password
            - name: GF_SERVER_ROOT_URL
              value: "https://grafana.dooplex.hu"
            # --- OAuth login via authentik ---
            - name: GF_AUTH_GENERIC_OAUTH_ENABLED
              value: "true"
            - name: GF_AUTH_GENERIC_OAUTH_NAME
              value: "authentik"
            - name: GF_AUTH_GENERIC_OAUTH_CLIENT_ID
              valueFrom:
                secretKeyRef:
                  name: grafana-oauth
                  key: client-id
            - name: GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET
              valueFrom:
                secretKeyRef:
                  name: grafana-oauth
                  key: client-secret
            - name: GF_AUTH_GENERIC_OAUTH_SCOPES
              value: "openid profile email grafana-admin-role"
            - name: GF_AUTH_GENERIC_OAUTH_AUTH_URL
              value: "https://authentik.dooplex.hu/application/o/authorize/"
            - name: GF_AUTH_GENERIC_OAUTH_TOKEN_URL
              value: "https://authentik.dooplex.hu/application/o/token/"
            - name: GF_AUTH_GENERIC_OAUTH_API_URL
              value: "https://authentik.dooplex.hu/application/o/userinfo/"
            - name: GF_AUTH_SIGNOUT_REDIRECT_URL
              value: "https://authentik.dooplex.hu/application/o/grafana/end-session/"
            - name: GF_AUTH_OAUTH_AUTO_LOGIN
              value: "true"
            - name: GF_AUTH_GENERIC_OAUTH_GROUPS_ATTRIBUTE_PATH
              value: "groups"
            # Members of 'Server Admin' or 'Grafana Admins' become Admin;
            # everyone else is mapped to Viewer.
            - name: GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH
              value: "(contains(groups[*], 'Server Admin') || contains(groups[*], 'Grafana Admins')) && 'Admin' || 'Viewer'"
            # --- Anonymous (read-only) access ---
            - name: GF_AUTH_ANONYMOUS_ENABLED
              value: "true"
            - name: GF_AUTH_ANONYMOUS_ORG_ROLE
              value: "Viewer"
            - name: GF_AUTH_ANONYMOUS_ORG_NAME
              value: "Main Org."
            - name: GF_AUTH_DISABLE_LOGIN_FORM
              value: "true"
            # --- Outgoing mail ---
            - name: GF_SMTP_ENABLED
              value: "true"
            - name: GF_SMTP_HOST
              value: "smtp.resend.com:587"
            - name: GF_SMTP_USER
              value: "resend"
            - name: GF_SMTP_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: grafana-smtp
                  key: resend-api-key
            - name: GF_SMTP_FROM_ADDRESS
              value: "monitoring@jarrs.eu"
            - name: GF_SMTP_FROM_NAME
              value: "JARR Monitoring"
            - name: GF_SMTP_STARTTLS_POLICY
              value: "MandatoryStartTLS"
          livenessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /api/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 256Mi
          volumeMounts:
            - name: data
              mountPath: /var/lib/grafana
            - name: datasources
              mountPath: /etc/grafana/provisioning/datasources
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: grafana-data
        - name: datasources
          configMap:
            name: grafana-datasources
---
# In-cluster endpoint for Grafana.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
  annotations:
    external-dns.alpha.kubernetes.io/hostname: grafana.home
spec:
  type: ClusterIP
  ports:
    - port: 3000
      targetPort: 3000
      name: http
  selector:
    app: grafana
---
# Publishes Grafana at grafana.dooplex.hu (TLS via cert-manager) and
# grafana.home. The configuration snippet returns 403 unless the client
# address starts with 192.168. or 10., or the GeoIP2 country code is "HU".
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana
  namespace: mon-system
  labels:
    app: grafana
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    external-dns.alpha.kubernetes.io/hostname: grafana.dooplex.hu,grafana.home
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/configuration-snippet: |
      set $geo_allowed 0;
      if ($remote_addr ~ "^192\.168\.") {
        set $geo_allowed 1;
      }
      if ($remote_addr ~ "^10\.") {
        set $geo_allowed 1;
      }
      if ($geoip2_country_code = "HU") {
        set $geo_allowed 1;
      }
      if ($geo_allowed = 0) {
        return 403 "Access restricted to Hungary";
      }
spec:
  ingressClassName: nginx-internal
  rules:
    - host: grafana.dooplex.hu
      http:
        paths:
          - backend:
              service:
                name: grafana
                port:
                  number: 3000
            path: /
            pathType: Prefix
    - host: grafana.home
      http:
        paths:
          - backend:
              service:
                name: grafana
                port:
                  number: 3000
            path: /
            pathType: Prefix
  tls:
    - hosts:
        - grafana.dooplex.hu
      secretName: grafana-tls

# =============================================================================
# NODE EXPORTER - Host metrics (CPU, RAM, Disk, Network)
# Runs on the host network to collect host metrics
# =============================================================================
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: mon-system
  labels:
    app: node-exporter
spec:
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9100"
    spec:
      hostNetwork: true
      hostPID: true
      containers:
        - name: node-exporter
          image: prom/node-exporter:v1.11.1
          args:
            # Host /proc, /sys and / are mounted read-only under /host below.
            - "--path.procfs=/host/proc"
            - "--path.sysfs=/host/sys"
            - "--path.rootfs=/host/root"
            # "$$" is the Kubernetes escape for a literal "$" in args.
            - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
            - "--collector.textfile.directory=/host/textfile"
          ports:
            - containerPort: 9100
              name: metrics
              hostPort: 9100
          resources:
            requests:
              cpu: 50m
              memory: 32Mi
            limits:
              cpu: 200m
              memory: 64Mi
          securityContext:
            privileged: true
          volumeMounts:
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
            - name: textfile
              mountPath: /host/textfile
              readOnly: true
      # Tolerate every NoSchedule taint so the exporter runs on all nodes.
      tolerations:
        - effect: NoSchedule
          operator: Exists
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
        - name: textfile
          hostPath:
            path: /var/lib/node_exporter/textfile_collector
            type: DirectoryOrCreate
---
# Stable in-cluster name for the 'node-exporter' scrape job target.
apiVersion: v1
kind: Service
metadata:
  name: node-exporter
  namespace: mon-system
  labels:
    app: node-exporter
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "9100"
spec:
  type: ClusterIP
  ports:
    - port: 9100
      targetPort: 9100
      name: metrics
  selector:
    app: node-exporter

# =============================================================================
# EXPORTARR - Metrics for Sonarr, Radarr, Prowlarr
#
# The API keys are read from the Secret "exportarr-api-keys" (keys: sonarr,
# radarr, prowlarr). It is not defined in this file and must exist in
# mon-system before these pods can start. The keys were previously committed
# here in plain text and should be rotated in each application.
# =============================================================================
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-sonarr
  namespace: mon-system
  labels:
    app: exportarr-sonarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-sonarr
  template:
    metadata:
      labels:
        app: exportarr-sonarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["sonarr"]
          env:
            - name: PORT
              value: "9707"
            - name: URL
              value: "http://sonarr.servarr-system:8989"
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-api-keys
                  key: sonarr
          ports:
            - containerPort: 9707
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
apiVersion: v1
kind: Service
metadata:
  name: exportarr-sonarr
  namespace: mon-system
  labels:
    app: exportarr-sonarr
spec:
  ports:
    - port: 9707
      targetPort: 9707
      name: metrics
  selector:
    app: exportarr-sonarr
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-radarr
  namespace: mon-system
  labels:
    app: exportarr-radarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-radarr
  template:
    metadata:
      labels:
        app: exportarr-radarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["radarr"]
          env:
            - name: PORT
              value: "9708"
            - name: URL
              value: "http://radarr.servarr-system:7878"
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-api-keys
                  key: radarr
          ports:
            - containerPort: 9708
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
apiVersion: v1
kind: Service
metadata:
  name: exportarr-radarr
  namespace: mon-system
  labels:
    app: exportarr-radarr
spec:
  ports:
    - port: 9708
      targetPort: 9708
      name: metrics
  selector:
    app: exportarr-radarr
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: exportarr-prowlarr
  namespace: mon-system
  labels:
    app: exportarr-prowlarr
spec:
  replicas: 1
  selector:
    matchLabels:
      app: exportarr-prowlarr
  template:
    metadata:
      labels:
        app: exportarr-prowlarr
    spec:
      containers:
        - name: exportarr
          image: ghcr.io/onedr0p/exportarr:v2.3.0
          args: ["prowlarr"]
          env:
            - name: PORT
              value: "9709"
            - name: URL
              value: "http://prowlarr.servarr-system:9696"
            - name: APIKEY
              valueFrom:
                secretKeyRef:
                  name: exportarr-api-keys
                  key: prowlarr
          ports:
            - containerPort: 9709
              name: metrics
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              cpu: 100m
              memory: 64Mi
---
apiVersion: v1
kind: Service
metadata:
  name: exportarr-prowlarr
  namespace: mon-system
  labels:
    app: exportarr-prowlarr
spec:
  ports:
    - port: 9709
      targetPort: 9709
      name: metrics
  selector:
    app: exportarr-prowlarr