From 72651daa82135018af7e80e5c016c0149b6ade24 Mon Sep 17 00:00:00 2001
From: kisfenyo
Date: Sun, 28 Dec 2025 11:48:15 +0100
Subject: [PATCH] added monitoring

---
 argocd-apps/homelab.yaml   |  22 +
 mon-system/monitoring.yaml | 831 +++++++++++++++++++++++++++++++++++++
 2 files changed, 853 insertions(+)
 create mode 100644 mon-system/monitoring.yaml

diff --git a/argocd-apps/homelab.yaml b/argocd-apps/homelab.yaml
index ebb4935..d1b6cd5 100644
--- a/argocd-apps/homelab.yaml
+++ b/argocd-apps/homelab.yaml
@@ -570,3 +570,25 @@ spec:
         factor: 2
         maxDuration: 3m
 ---
+# Monitoring (Grafana, Prometheus)
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: monitoring
+  namespace: argocd
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  project: homelab
+  source:
+    repoURL: https://gitea.dooplex.hu/admin/homelab-manifests.git
+    targetRevision: main
+    path: mon-system  # directory that holds monitoring.yaml (added below)
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: mon-system
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
+      - PruneLast=true
+---
diff --git a/mon-system/monitoring.yaml b/mon-system/monitoring.yaml
new file mode 100644
index 0000000..5c183c8
--- /dev/null
+++ b/mon-system/monitoring.yaml
@@ -0,0 +1,831 @@
+# =============================================================================
+# MONITORING STACK - Prometheus + Grafana
+# Namespace: mon-system
+# =============================================================================
+
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: mon-system
+  labels:
+    name: mon-system
+
+# =============================================================================
+# PROMETHEUS CONFIGURATION
+# =============================================================================
+---
+# =============================================================================
+# Prometheus ConfigMap: global settings, Alertmanager target, rule files, jobs
+# =============================================================================
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: mon-system
+  labels:
+    app: prometheus
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 30s
+      evaluation_interval: 30s
+      scrape_timeout: 10s
+
+    # Alertmanager endpoint that firing alerts are sent to
+    alerting:
+      alertmanagers:
+        - static_configs:
+            - targets:
+                - alertmanager:9093
+
+    # Load alerting rules
+    rule_files:
+      - /etc/prometheus/rules/*.yml
+
+    scrape_configs:
+      # Prometheus self-monitoring
+      - job_name: 'prometheus'
+        static_configs:
+          - targets: ['localhost:9090']
+
+      # Node Exporter - Host metrics
+      - job_name: 'node-exporter'
+        static_configs:
+          - targets: ['node-exporter:9100']
+        relabel_configs:
+          - source_labels: [__address__]
+            target_label: instance
+            replacement: 'dooplex'
+
+      # Kube-state-metrics - Kubernetes object metrics
+      - job_name: 'kube-state-metrics'
+        static_configs:
+          - targets: ['kube-state-metrics:8080']
+
+      # Nginx Ingress Controller
+      - job_name: 'nginx-ingress'
+        static_configs:
+          - targets: ['ingress-nginx-internal-controller-metrics.nginx-system:10254']
+
+      # Kubernetes API server
+      - job_name: 'kubernetes-apiservers'
+        kubernetes_sd_configs:
+          - role: endpoints
+        scheme: https
+        tls_config:
+          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+          insecure_skip_verify: false  # verify against ca_file; the bearer token below must not go to an unverified peer
+        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+            action: keep
+            regex: default;kubernetes;https
+        metric_relabel_configs:
+          # Drop high-cardinality histogram buckets
+          - source_labels: [__name__]
+            action: drop
+            regex: '(apiserver|etcd|workqueue)_.+_bucket'
+
+      # Kubernetes nodes (kubelet metrics)
+      - job_name: 'kubernetes-nodes'
+        kubernetes_sd_configs:
+          - role: node
+        scheme: https
+        tls_config:
+          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+          insecure_skip_verify: true  # NOTE(review): makes ca_file ineffective; confirm the kubelet cert really cannot be verified
+        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+        relabel_configs:
+          - action: labelmap
+            regex: __meta_kubernetes_node_label_(.+)
+        metric_relabel_configs:
+          # Drop histogram buckets from kubelet too
+          - source_labels: [__name__]
+            action: drop
+            regex: '.+_bucket'
+
+      # Kubernetes nodes cadvisor (container metrics)
+      - job_name: 'kubernetes-cadvisor'
+        kubernetes_sd_configs:
+          - role: node
+        scheme: https
+        tls_config:
+          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+          insecure_skip_verify: true  # NOTE(review): same caveat as the kubernetes-nodes job
+        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+        metrics_path: /metrics/cadvisor
+        relabel_configs:
+          - action: labelmap
+            regex: __meta_kubernetes_node_label_(.+)
+
+      # Pi-hole
+      - job_name: 'pihole'
+        scrape_interval: 60s
+        static_configs:
+          - targets: ['pihole-exporter.pihole-system:9617']
+
+      # Kubernetes services - EXCLUDE services that have dedicated jobs
+      - job_name: 'kubernetes-services'
+        kubernetes_sd_configs:
+          - role: service
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+            action: keep
+            regex: true
+          # DROP services that already have dedicated scrape jobs
+          - source_labels: [__meta_kubernetes_service_name]
+            action: drop
+            regex: (node-exporter|kube-state-metrics|prometheus|ingress-nginx-internal-controller-metrics)
+          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+            action: replace
+            target_label: __metrics_path__
+            regex: (.+)
+          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+            action: replace
+            target_label: __address__
+            regex: ([^:]+)(?::\d+)?;(\d+)
+            replacement: $1:$2
+          - action: labelmap
+            regex: __meta_kubernetes_service_label_(.+)
+          - source_labels: [__meta_kubernetes_namespace]
+            action: replace
+            target_label: namespace
+          - source_labels: [__meta_kubernetes_service_name]
+            action: replace
+            target_label: service
+
+      # Servarr stack metrics - 60s interval (less critical)
+      - job_name: 'sonarr'
+        scrape_interval: 60s
+        static_configs:
+          - targets: ['exportarr-sonarr:9707']
+
+      - job_name: 'radarr'
+        scrape_interval: 60s
+        static_configs:
+          - targets: ['exportarr-radarr:9708']
+
+      - job_name: 'prowlarr'
+        scrape_interval: 60s
+        static_configs:
+          - targets: ['exportarr-prowlarr:9709']
+
+      # Longhorn metrics - 60s interval
+      - job_name: 'longhorn'
+        scrape_interval: 60s
+        static_configs:
+          - targets: ['longhorn-backend.longhorn-system:9500']
+
+      # cert-manager metrics - 60s interval
+      - job_name: 'cert-manager'
+        scrape_interval: 60s
+        static_configs:
+          - targets: ['cert-manager.cert-manager:9402']
+
+      # Alertmanager metrics
+      - job_name: 'alertmanager'
+        static_configs:
+          - targets: ['alertmanager:9093']
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: prometheus-data
+  namespace: mon-system
+  labels:
+    app: prometheus
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 20Gi
+
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: prometheus
+  namespace: mon-system
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: prometheus
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - nodes/proxy
+      - nodes/metrics
+      - services
+      - endpoints
+      - pods
+    verbs: ["get", "list", "watch"]
+  - apiGroups: ["extensions", "networking.k8s.io"]
+    resources:
+      - ingresses
+    verbs: ["get", "list", "watch"]
+  - nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
+    verbs: ["get"]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: prometheus
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: prometheus
+subjects:
+  - kind: ServiceAccount
+    name: prometheus
+    namespace: mon-system
+
+---
+# =============================================================================
+# Prometheus Deployment (config, rules and data volumes)
+# 
=============================================================================
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: prometheus
+  name: prometheus
+  namespace: mon-system
+spec:
+  replicas: 1
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      app: prometheus
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      containers:
+        - name: prometheus
+          image: prom/prometheus:v3.8.0
+          args:
+            - --config.file=/etc/prometheus/prometheus.yml
+            - --storage.tsdb.path=/prometheus
+            - --storage.tsdb.retention.time=15d
+            - --storage.tsdb.retention.size=16GB  # ~80% of the 20Gi prometheus-data PVC, leaving headroom on the volume
+            - --web.enable-lifecycle
+            # --web.enable-admin-api is deliberately not set: this pod is exposed by an Ingress with no auth annotations
+          ports:
+            - containerPort: 9090
+              name: http
+          resources:
+            requests:
+              cpu: 100m
+              memory: 256Mi
+            limits:
+              cpu: '2'
+              memory: 6Gi
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 15
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9090
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          volumeMounts:
+            - name: config
+              mountPath: /etc/prometheus
+            - name: rules
+              mountPath: /etc/prometheus/rules
+            - name: data
+              mountPath: /prometheus
+      securityContext:
+        fsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      serviceAccountName: prometheus
+      volumes:
+        - name: config
+          configMap:
+            name: prometheus-config
+        - name: rules
+          configMap:
+            name: prometheus-rules  # NOTE(review): not defined in this file; confirm it exists before syncing
+        - name: data
+          persistentVolumeClaim:
+            claimName: prometheus-data
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: mon-system
+  labels:
+    app: prometheus
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "9090"
+spec:
+  type: ClusterIP
+  ports:
+    - port: 9090
+      targetPort: 9090
+      name: http
+  selector:
+    app: prometheus
+
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: prometheus
+  namespace: mon-system
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: prometheus.home
+spec:
+  ingressClassName: nginx-internal
+  rules:
+    - host: prometheus.home
+      http:
+        paths:
+          - backend:
+              service:
+                name: prometheus
+                port:
+                  number: 9090
+            path: /
+            pathType: Prefix
+
+# =============================================================================
+# GRAFANA CONFIGURATION
+# =============================================================================
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-datasources
+  namespace: mon-system
+  labels:
+    app: grafana
+data:
+  datasources.yaml: |
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        access: proxy
+        url: http://prometheus:9090
+        isDefault: true
+        editable: true
+
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards-config
+  namespace: mon-system
+  labels:
+    app: grafana
+data:
+  dashboards.yaml: |
+    apiVersion: 1
+    providers:
+      - name: 'default'
+        orgId: 1
+        folder: ''
+        type: file
+        disableDeletion: false
+        editable: true
+        options:
+          path: /var/lib/grafana/dashboards
+
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: grafana-data
+  namespace: mon-system
+  labels:
+    app: grafana
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 1Gi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grafana
+  namespace: mon-system
+  labels:
+    app: grafana
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: grafana
+  template:
+    metadata:
+      labels:
+        app: grafana
+    spec:
+      securityContext:
+        fsGroup: 472
+        runAsUser: 472
+        runAsGroup: 472
+      containers:
+        - name: grafana
+          image: grafana/grafana:12.3.0
+          ports:
+            - containerPort: 3000
+              name: http
+          env:  # admin credentials are read from the grafana-credentials Secret, which this file does not define
+            - name: GF_SECURITY_ADMIN_USER
+              valueFrom:
+                secretKeyRef:
+                  name: grafana-credentials
+                  key: admin-user
+            - name: GF_SECURITY_ADMIN_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: grafana-credentials
+                  key: admin-password
+            - name: GF_SERVER_ROOT_URL
+              value: "https://grafana.dooplex.hu"
+            - name: GF_INSTALL_PLUGINS  # NOTE(review): piechart panel is Angular-based; verify it still loads on Grafana 12
+              value: "grafana-piechart-panel,grafana-clock-panel"
+          livenessProbe:
+            httpGet:
+              path: /api/health
+              port: 3000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /api/health
+              port: 3000
+            initialDelaySeconds: 5
+            periodSeconds: 5
+          resources:
+            requests:
+              cpu: 100m
+              memory: 128Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/grafana
+            - name: datasources
+              mountPath: /etc/grafana/provisioning/datasources
+            - name: dashboards-config
+              mountPath: /etc/grafana/provisioning/dashboards
+      volumes:
+        - name: data
+          persistentVolumeClaim:
+            claimName: grafana-data
+        - name: datasources
+          configMap:
+            name: grafana-datasources
+        - name: dashboards-config
+          configMap:
+            name: grafana-dashboards-config
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: grafana
+  namespace: mon-system
+  labels:
+    app: grafana
+  annotations:
+    external-dns.alpha.kubernetes.io/hostname: grafana.home
+spec:
+  type: ClusterIP
+  ports:
+    - port: 3000
+      targetPort: 3000
+      name: http
+  selector:
+    app: grafana
+
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: grafana
+  namespace: mon-system
+  labels:
+    app: grafana
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    external-dns.alpha.kubernetes.io/hostname: grafana.dooplex.hu,grafana.home
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+spec:
+  ingressClassName: nginx-internal
+  rules:
+    - host: grafana.dooplex.hu
+      http:
+        paths:
+          - backend:
+              service:
+                name: grafana
+                port:
+                  number: 3000
+            path: /
+            pathType: Prefix
+    - host: grafana.home
+      http:
+        paths:
+          - backend:
+              service:
+                name: grafana
+                port:
+                  number: 3000
+            path: /
+            pathType: Prefix
+  tls:  # only grafana.dooplex.hu is listed here; grafana.home has no TLS entry
+    - hosts:
+        - grafana.dooplex.hu
+      secretName: grafana-tls
+
+# =============================================================================
+# NODE EXPORTER - Host metrics (CPU, RAM, Disk, Network)
+# Runs on the host network to collect host
metrics
+# =============================================================================
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: node-exporter
+  namespace: mon-system
+  labels:
+    app: node-exporter
+spec:
+  selector:
+    matchLabels:
+      app: node-exporter
+  template:
+    metadata:
+      labels:
+        app: node-exporter
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9100"
+    spec:
+      hostNetwork: true
+      hostPID: true
+      containers:
+        - name: node-exporter
+          image: prom/node-exporter:v1.10.2
+          args:  # host /proc, /sys and / are mounted read-only under /host (see volumeMounts)
+            - "--path.procfs=/host/proc"
+            - "--path.sysfs=/host/sys"
+            - "--path.rootfs=/host/root"
+            - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
+            - "--collector.textfile.directory=/host/textfile"
+          ports:
+            - containerPort: 9100
+              name: metrics
+              hostPort: 9100
+          resources:
+            requests:
+              cpu: 50m
+              memory: 32Mi
+            limits:
+              cpu: 200m
+              memory: 64Mi
+          securityContext:
+            allowPrivilegeEscalation: false  # was privileged: true; the exporter only reads the read-only host mounts below
+          volumeMounts:
+            - name: proc
+              mountPath: /host/proc
+              readOnly: true
+            - name: sys
+              mountPath: /host/sys
+              readOnly: true
+            - name: root
+              mountPath: /host/root
+              mountPropagation: HostToContainer
+              readOnly: true
+            - name: textfile
+              mountPath: /host/textfile
+              readOnly: true
+      tolerations:
+        - effect: NoSchedule
+          operator: Exists
+      volumes:
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: root
+          hostPath:
+            path: /
+        - name: textfile
+          hostPath:
+            path: /var/lib/node_exporter/textfile_collector
+            type: DirectoryOrCreate
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: node-exporter
+  namespace: mon-system
+  labels:
+    app: node-exporter
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "9100"
+spec:
+  type: ClusterIP
+  ports:
+    - port: 9100
+      targetPort: 9100
+      name: metrics
+  selector:
+    app: node-exporter
+
+# =============================================================================
+# EXPORTARR - Metrics for Sonarr, Radarr, Prowlarr
+# 
=============================================================================
+---
+apiVersion: apps/v1
+kind: Deployment  # exportarr instance for Sonarr, serving metrics on :9707
+metadata:
+  name: exportarr-sonarr
+  namespace: mon-system
+  labels:
+    app: exportarr-sonarr
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: exportarr-sonarr
+  template:
+    metadata:
+      labels:
+        app: exportarr-sonarr
+    spec:
+      containers:
+        - name: exportarr
+          image: ghcr.io/onedr0p/exportarr:v2.3.0
+          args: ["sonarr"]
+          env:  # APIKEY is read from Secret exportarr-credentials (create out-of-band, like grafana-credentials)
+            - name: PORT
+              value: "9707"
+            - name: URL
+              value: "http://sonarr.servarr-system:8989"
+            - name: APIKEY  # never commit the key itself; rotate the key that was previously in this file
+              valueFrom: {secretKeyRef: {name: exportarr-credentials, key: sonarr-api-key}}
+          ports:
+            - containerPort: 9707
+              name: metrics
+          resources:
+            requests:
+              cpu: 10m
+              memory: 32Mi
+            limits:
+              cpu: 100m
+              memory: 64Mi
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: exportarr-sonarr
+  namespace: mon-system
+spec:
+  ports:
+    - port: 9707
+      targetPort: 9707
+  selector:
+    app: exportarr-sonarr
+
+---
+apiVersion: apps/v1
+kind: Deployment  # exportarr instance for Radarr, serving metrics on :9708
+metadata:
+  name: exportarr-radarr
+  namespace: mon-system
+  labels:
+    app: exportarr-radarr
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: exportarr-radarr
+  template:
+    metadata:
+      labels:
+        app: exportarr-radarr
+    spec:
+      containers:
+        - name: exportarr
+          image: ghcr.io/onedr0p/exportarr:v2.3.0
+          args: ["radarr"]
+          env:  # APIKEY is read from Secret exportarr-credentials (create out-of-band, like grafana-credentials)
+            - name: PORT
+              value: "9708"
+            - name: URL
+              value: "http://radarr.servarr-system:7878"
+            - name: APIKEY  # never commit the key itself; rotate the key that was previously in this file
+              valueFrom: {secretKeyRef: {name: exportarr-credentials, key: radarr-api-key}}
+          ports:
+            - containerPort: 9708
+              name: metrics
+          resources:
+            requests:
+              cpu: 10m
+              memory: 32Mi
+            limits:
+              cpu: 100m
+              memory: 64Mi
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: exportarr-radarr
+  namespace: mon-system
+spec:
+  ports:
+    - port: 9708
+      targetPort: 9708
+  selector:
+    app: exportarr-radarr
+
+---
+apiVersion: apps/v1
+kind: Deployment  # exportarr instance for Prowlarr, serving metrics on :9709
+metadata:
+  name: exportarr-prowlarr
+  namespace: mon-system
+  labels:
+    app: exportarr-prowlarr
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: exportarr-prowlarr
+  template:
+    metadata:
+      labels:
+        app: exportarr-prowlarr
+    spec:
+      containers:
+        - name: exportarr
+          image: ghcr.io/onedr0p/exportarr:v2.3.0
+          args: ["prowlarr"]
+          env:  # APIKEY is read from Secret exportarr-credentials (create out-of-band, like grafana-credentials)
+            - name: PORT
+              value: "9709"
+            - name: URL
+              value: "http://prowlarr.servarr-system:9696"
+            - name: APIKEY  # never commit the key itself; rotate the key that was previously in this file
+              valueFrom: {secretKeyRef: {name: exportarr-credentials, key: prowlarr-api-key}}
+          ports:
+            - containerPort: 9709
+              name: metrics
+          resources:
+            requests:
+              cpu: 10m
+              memory: 32Mi
+            limits:
+              cpu: 100m
+              memory: 64Mi
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: exportarr-prowlarr
+  namespace: mon-system
+spec:
+  ports:
+    - port: 9709
+      targetPort: 9709
+  selector:
+    app: exportarr-prowlarr