v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups

Phase 2 (Monitoring & Health):
- Central job scheduler replacing ad-hoc goroutines (internal/scheduler)
- CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go)
- Temperature reading from /sys/class/thermal + /host/sys (Docker mount)
- Load average from /proc/loadavg
- Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go)
- System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go)

Phase 3 (Backups):
- Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go)
- Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes
- Restic backup integration with auto-password generation (internal/backup/restic.go)
- Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go)
- Manual backup trigger via dashboard button and POST /api/backup/run

Dashboard UI:
- CPU usage bar with load average display
- Temperature with colored indicator dot
- Backup status card with last run time, DB count, repo stats
- "Mentés most" button for manual backup trigger

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 11:17:10 +01:00
parent 8a988c5998
commit d32d9fb44b
21 changed files with 2060 additions and 82 deletions
+160
View File
@@ -0,0 +1,160 @@
package monitor
import (
	"context"
	"fmt"
	"os/exec"
	"strings"
	"time"

	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
	"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// HealthReport is the outcome of one system health check run.
//
// RunHealthCheck fills the three message lists and then derives Status
// from them: any issue yields "fail", otherwise any warning yields "warn",
// otherwise the status stays "ok".
type HealthReport struct {
	// Status is the overall verdict: "ok", "warn" or "fail".
	Status string
	// Issues lists critical problems.
	Issues []string
	// Warnings lists non-critical problems.
	Warnings []string
	// Info lists informational items.
	Info []string
	// Timestamp records when the report was created.
	Timestamp time.Time
}
// RunHealthCheck gathers system metrics and Docker state and classifies
// each finding as an issue, a warning or an informational note.
//
// Metrics are read through system.GetInfo; a metric whose value is not
// greater than zero is skipped entirely. Thresholds come from
// cfg.Monitoring.Thresholds. Docker reachability and every container named
// in cfg.Stacks.Protected are checked via the docker CLI.
//
// The returned report has Status "fail" if any issue was recorded, "warn"
// if only warnings were recorded, and "ok" otherwise.
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector) *HealthReport {
	report := &HealthReport{
		Status:    "ok",
		Timestamp: time.Now(),
	}

	stats := system.GetInfo(cfg.Paths.HDDPath, cpuCollector)

	// 1. SSD usage: critical -> issue, high -> warning, otherwise info.
	if pct := stats.DiskPercent; pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			report.Issues = append(report.Issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", pct))
		case pct >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", pct))
		default:
			report.Info = append(report.Info, fmt.Sprintf("SSD: %.0f%% used", pct))
		}
	}

	// HDD usage: same thresholds as the SSD, but no info line when healthy.
	if pct := stats.HDDPercent; stats.HDDConfigured && pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			report.Issues = append(report.Issues, fmt.Sprintf("HDD disk usage critical: %.0f%%", pct))
		case pct >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("HDD disk usage high: %.0f%%", pct))
		}
	}

	// 2. Memory usage: warning level only.
	if pct := stats.MemPercent; pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("Memory usage high: %.0f%%", pct))
		default:
			report.Info = append(report.Info, fmt.Sprintf("Memory: %.0f%% used", pct))
		}
	}

	// 3. CPU usage: warning level only.
	if pct := stats.CPUPercent; pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("CPU usage high: %.0f%%", pct))
		default:
			report.Info = append(report.Info, fmt.Sprintf("CPU: %.0f%%", pct))
		}
	}

	// 4. Temperature: the warning names the sensor source, the info line does not.
	if deg := stats.TemperatureCelsius; deg > 0 {
		switch {
		case deg >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius):
			report.Warnings = append(report.Warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", deg, stats.TemperatureSource))
		default:
			report.Info = append(report.Info, fmt.Sprintf("Temperature: %.0f°C", deg))
		}
	}

	// 5. Docker daemon reachability.
	dockerErr := checkDocker()
	if dockerErr == nil {
		report.Info = append(report.Info, "Docker: reachable")
	} else {
		report.Issues = append(report.Issues, fmt.Sprintf("Docker: %v", dockerErr))
	}

	// 6. Every protected container that is not running is a critical issue.
	for _, container := range checkProtectedContainers(cfg.Stacks.Protected) {
		report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", container))
	}

	// Overall status: issues outrank warnings.
	switch {
	case len(report.Issues) > 0:
		report.Status = "fail"
	case len(report.Warnings) > 0:
		report.Status = "warn"
	}

	return report
}
// FormatMessage renders the report as plain text for use as a healthcheck
// ping body.
//
// The output starts with the upper-cased status and the timestamp, followed
// by an ISSUES, WARNINGS and INFO section in that order. A section is
// emitted only when its list is non-empty.
func (r *HealthReport) FormatMessage() string {
	var out strings.Builder

	fmt.Fprintf(&out, "Status: %s\n", strings.ToUpper(r.Status))
	fmt.Fprintf(&out, "Time: %s\n\n", r.Timestamp.Format("2006-01-02 15:04:05"))

	// section writes one titled bullet list; blankAfter adds the separating
	// empty line used after every section except the last one.
	section := func(heading string, items []string, blankAfter bool) {
		if len(items) == 0 {
			return
		}
		out.WriteString(heading)
		for _, item := range items {
			out.WriteString(" - " + item + "\n")
		}
		if blankAfter {
			out.WriteString("\n")
		}
	}

	section("ISSUES:\n", r.Issues, true)
	section("WARNINGS:\n", r.Warnings, true)
	section("INFO:\n", r.Info, false)

	return out.String()
}
// checkDocker reports whether the Docker daemon answers `docker info`.
//
// It returns nil when the CLI prints a non-empty server version. It returns
// an error when the command fails, when it does not finish within the
// timeout, or when the reported version is empty.
//
// The command runs under a deadline so that an unresponsive daemon cannot
// block the whole health check indefinitely.
func checkDocker() error {
	const timeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, "docker", "info", "--format", "{{.ServerVersion}}").Output()
	if err != nil {
		// A killed process only reports "signal: killed"; name the real cause.
		if ctx.Err() != nil {
			return fmt.Errorf("docker not reachable: no response within %s", timeout)
		}
		return fmt.Errorf("docker not reachable: %w", err)
	}
	if strings.TrimSpace(string(out)) == "" {
		return fmt.Errorf("docker returned empty version")
	}
	return nil
}
// checkProtectedContainers returns the names from protected whose container
// is not confirmed to be running, in input order.
//
// A container counts as missing when `docker inspect` fails for it (unknown
// name, unreachable daemon, timeout) or when its running state is anything
// other than "true". A nil or empty input yields a nil result.
func checkProtectedContainers(protected []string) []string {
	var missing []string
	for _, name := range protected {
		if !containerRunning(name) {
			missing = append(missing, name)
		}
	}
	return missing
}

// containerRunning reports whether `docker inspect` says the named container
// is running. Each lookup has its own deadline so that one hung call cannot
// stall the remaining checks; living in a helper lets the deferred cancel
// fire per container rather than at the end of the whole loop.
func containerRunning(name string) bool {
	const timeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{.State.Running}}", name).Output()
	if err != nil {
		return false
	}
	return strings.TrimSpace(string(out)) == "true"
}
+92
View File
@@ -0,0 +1,92 @@
package monitor
import (
"fmt"
"io"
"log"
"net/http"
"strings"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
)
// Pinger reports job status to a Healthchecks.io-compatible server over HTTP.
type Pinger struct {
	// baseURL is the server root; ping URLs are built as
	// baseURL + "/ping/" + uuid + suffix.
	baseURL string
	// httpClient performs the ping requests.
	httpClient *http.Client
	// logger receives a warning when a ping ultimately fails.
	logger *log.Logger
	// enabled gates all pings; when false nothing is sent.
	enabled bool
}
// NewPinger builds a Pinger from the monitoring configuration.
//
// Trailing slashes are stripped from cfg.HealthchecksBase, the HTTP client
// gets a 10 second timeout, and pinging is active only when cfg.Enabled is
// true. Warnings about failed pings are written to logger.
func NewPinger(cfg *config.MonitoringConfig, logger *log.Logger) *Pinger {
	client := &http.Client{Timeout: 10 * time.Second}
	base := strings.TrimRight(cfg.HealthchecksBase, "/")

	p := new(Pinger)
	p.baseURL = base
	p.httpClient = client
	p.logger = logger
	p.enabled = cfg.Enabled
	return p
}
// Ping signals success for the check identified by uuid. A non-empty body
// is sent along as the request payload.
func (p *Pinger) Ping(uuid, body string) error {
	const suffix = "" // the bare ping URL is the success endpoint
	return p.send(uuid, suffix, body)
}
// Fail signals failure for the check identified by uuid. A non-empty body
// is sent along as the request payload.
func (p *Pinger) Fail(uuid, body string) error {
	const suffix = "/fail"
	return p.send(uuid, suffix, body)
}
// Start signals that the job identified by uuid has begun (used for
// duration tracking). No request body is sent.
func (p *Pinger) Start(uuid string) error {
	const (
		suffix = "/start"
		noBody = ""
	)
	return p.send(uuid, suffix, noBody)
}
// send POSTs one signal to {baseURL}/ping/{uuid}{suffix}, with body as the
// request payload when it is non-empty.
//
// Nothing is sent when the pinger is disabled, or when uuid is empty or
// still starts with the "CHANGEME" placeholder. A request that fails in
// transport or returns a non-2xx status is retried, up to three attempts
// with a two second pause between them.
//
// send always returns nil: a failed ping is logged as a warning and must
// never affect the caller.
func (p *Pinger) send(uuid, suffix, body string) error {
	if !p.enabled {
		return nil
	}
	if uuid == "" || strings.HasPrefix(uuid, "CHANGEME") {
		return nil
	}

	const (
		maxAttempts = 3
		retryDelay  = 2 * time.Second
		// maxDrain caps how much of a response body is read before closing.
		maxDrain = 64 << 10
	)

	endpoint := fmt.Sprintf("%s/ping/%s%s", p.baseURL, uuid, suffix)

	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if attempt > 0 {
			time.Sleep(retryDelay)
		}

		// A fresh reader per attempt: the previous one may be consumed.
		var payload io.Reader
		if body != "" {
			payload = strings.NewReader(body)
		}

		req, err := http.NewRequest(http.MethodPost, endpoint, payload)
		if err != nil {
			// Building the request fails the same way every time (e.g. a
			// malformed base URL), so retrying would only waste the delays.
			p.logger.Printf("[WARN] Health ping request could not be built (%s): %v", uuid, err)
			return nil
		}

		resp, err := p.httpClient.Do(req)
		if err != nil {
			lastErr = err
			continue
		}
		// Drain before closing so the keep-alive connection can be reused
		// by the next attempt or the next ping. Best effort: errors here
		// do not change the outcome.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrain))
		resp.Body.Close()

		if resp.StatusCode >= 200 && resp.StatusCode < 300 {
			return nil
		}
		lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	p.logger.Printf("[WARN] Health ping failed after %d attempts (%s): %v", maxAttempts, uuid, lastErr)
	return nil // Never let ping failures affect the caller
}