v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups
Phase 2 (Monitoring & Health):
- Central job scheduler replacing ad-hoc goroutines (internal/scheduler)
- CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go)
- Temperature reading from /sys/class/thermal + /host/sys (Docker mount)
- Load average from /proc/loadavg
- Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go)
- System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go)

Phase 3 (Backups):
- Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go)
- Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes
- Restic backup integration with auto-password generation (internal/backup/restic.go)
- Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go)
- Manual backup trigger via dashboard button and POST /api/backup/run

Dashboard UI:
- CPU usage bar with load average display
- Temperature with colored indicator dot
- Backup status card with last run time, DB count, repo stats
- "Mentés most" button for manual backup trigger

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,316 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
)
|
||||
|
||||
// ResticManager runs restic commands against a single repository.
// Its fields are filled in by NewResticManager.
type ResticManager struct {
	// repoPath is handed to restic as RESTIC_REPOSITORY.
	repoPath string
	// passwordFile is handed to restic as RESTIC_PASSWORD_FILE.
	passwordFile string
	// logger receives the [INFO] and [WARN] messages emitted by the manager.
	logger *log.Logger
	// customerID is passed to "restic backup" as the --host value.
	customerID string
	// cacheDir is handed to restic as RESTIC_CACHE_DIR.
	cacheDir string
}
|
||||
|
||||
// SnapshotResult describes one completed "restic backup" run.
type SnapshotResult struct {
	// SnapshotID is the snapshot_id reported in restic's summary message.
	SnapshotID string
	// FilesNew is the files_new counter from the summary message.
	FilesNew int
	// FilesChanged is the files_changed counter from the summary message.
	FilesChanged int
	// DataAdded is the summary's data_added byte count, already formatted
	// for display.
	DataAdded string
	// Duration is the wall-clock time the backup took.
	Duration time.Duration
}
|
||||
|
||||
// SnapshotInfo is one entry of the JSON array printed by
// "restic snapshots --json". Only the fields listed here are decoded.
type SnapshotInfo struct {
	// ID is decoded from the "short_id" key, not from the full "id".
	ID string `json:"short_id"`
	// Time is the snapshot timestamp.
	Time time.Time `json:"time"`
	// Paths lists the paths recorded in the snapshot.
	Paths []string `json:"paths"`
	// Tags lists the tags attached to the snapshot.
	Tags []string `json:"tags"`
}
|
||||
|
||||
// RepoStats holds repository statistics.
|
||||
type RepoStats struct {
|
||||
TotalSize string
|
||||
SnapshotCount int
|
||||
LatestSnapshot *SnapshotInfo
|
||||
}
|
||||
|
||||
// NewResticManager creates a new restic manager.
|
||||
func NewResticManager(cfg *config.Config, logger *log.Logger) *ResticManager {
|
||||
return &ResticManager{
|
||||
repoPath: cfg.Backup.ResticRepo,
|
||||
passwordFile: cfg.Backup.ResticPasswordFile,
|
||||
logger: logger,
|
||||
customerID: cfg.Customer.ID,
|
||||
cacheDir: filepath.Join(cfg.Paths.DataDir, "restic-cache"),
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureInitialized checks if the restic repo exists and initializes it if not.
|
||||
// Also auto-generates the password file if missing.
|
||||
func (r *ResticManager) EnsureInitialized() error {
|
||||
// Ensure password file exists
|
||||
if _, err := os.Stat(r.passwordFile); os.IsNotExist(err) {
|
||||
if err := r.generatePassword(); err != nil {
|
||||
return fmt.Errorf("generating restic password: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure cache dir exists
|
||||
os.MkdirAll(r.cacheDir, 0700)
|
||||
|
||||
// Check if repo is already initialized
|
||||
configPath := filepath.Join(r.repoPath, "config")
|
||||
if _, err := os.Stat(configPath); err == nil {
|
||||
r.logger.Printf("[INFO] Restic repo already initialized at %s", r.repoPath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Ensure repo directory exists
|
||||
if err := os.MkdirAll(r.repoPath, 0700); err != nil {
|
||||
return fmt.Errorf("creating repo dir: %w", err)
|
||||
}
|
||||
|
||||
// Initialize repo
|
||||
r.logger.Printf("[INFO] Initializing restic repository at %s", r.repoPath)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
cmd := r.command(ctx, "init")
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("restic init failed: %v — %s", err, truncate(string(out), 200))
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Restic repository initialized successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Snapshot creates a new backup snapshot of the given paths.
|
||||
func (r *ResticManager) Snapshot(paths []string, tags []string) (*SnapshotResult, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
start := time.Now()
|
||||
|
||||
args := []string{"backup", "--json"}
|
||||
for _, tag := range tags {
|
||||
args = append(args, "--tag", tag)
|
||||
}
|
||||
args = append(args, "--host", r.customerID)
|
||||
|
||||
// Only include paths that exist
|
||||
var existingPaths []string
|
||||
for _, p := range paths {
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
existingPaths = append(existingPaths, p)
|
||||
} else {
|
||||
r.logger.Printf("[WARN] Backup path does not exist, skipping: %s", p)
|
||||
}
|
||||
}
|
||||
|
||||
if len(existingPaths) == 0 {
|
||||
return nil, fmt.Errorf("no backup paths exist")
|
||||
}
|
||||
args = append(args, existingPaths...)
|
||||
|
||||
cmd := r.command(ctx, args...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
// Check for stale lock
|
||||
errStr := string(out)
|
||||
if strings.Contains(errStr, "lock") || strings.Contains(errStr, "locked") {
|
||||
r.logger.Printf("[WARN] Restic repo locked — attempting unlock")
|
||||
unlockCmd := r.command(ctx, "unlock")
|
||||
unlockCmd.Run()
|
||||
// Retry once
|
||||
cmd = r.command(ctx, args...)
|
||||
out, err = cmd.Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("restic backup failed after unlock: %v", err)
|
||||
}
|
||||
} else {
|
||||
return nil, fmt.Errorf("restic backup failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
result := &SnapshotResult{
|
||||
Duration: time.Since(start),
|
||||
}
|
||||
|
||||
// Parse JSON output — look for the summary line
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var msg struct {
|
||||
MessageType string `json:"message_type"`
|
||||
FilesNew int `json:"files_new"`
|
||||
FilesChanged int `json:"files_changed"`
|
||||
DataAdded int64 `json:"data_added"`
|
||||
SnapshotID string `json:"snapshot_id"`
|
||||
}
|
||||
if err := json.Unmarshal([]byte(line), &msg); err != nil {
|
||||
continue
|
||||
}
|
||||
if msg.MessageType == "summary" {
|
||||
result.SnapshotID = msg.SnapshotID
|
||||
result.FilesNew = msg.FilesNew
|
||||
result.FilesChanged = msg.FilesChanged
|
||||
result.DataAdded = formatBytes(msg.DataAdded)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Prune removes old snapshots according to retention policy.
|
||||
func (r *ResticManager) Prune(retention config.RetentionConfig) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
args := []string{
|
||||
"forget",
|
||||
"--keep-daily", fmt.Sprintf("%d", retention.KeepDaily),
|
||||
"--keep-weekly", fmt.Sprintf("%d", retention.KeepWeekly),
|
||||
"--keep-monthly", fmt.Sprintf("%d", retention.KeepMonthly),
|
||||
"--prune",
|
||||
}
|
||||
|
||||
cmd := r.command(ctx, args...)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("restic forget/prune failed: %v — %s", err, truncate(string(out), 200))
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Restic prune completed")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check verifies repository integrity.
|
||||
func (r *ResticManager) Check() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
cmd := r.command(ctx, "check")
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("restic check failed: %v — %s", err, truncate(string(out), 200))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// LatestSnapshot returns the most recent snapshot info.
|
||||
func (r *ResticManager) LatestSnapshot() (*SnapshotInfo, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
cmd := r.command(ctx, "snapshots", "--latest", "1", "--json")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("restic snapshots failed: %v", err)
|
||||
}
|
||||
|
||||
var snapshots []SnapshotInfo
|
||||
if err := json.Unmarshal(out, &snapshots); err != nil {
|
||||
return nil, fmt.Errorf("parsing snapshot JSON: %v", err)
|
||||
}
|
||||
|
||||
if len(snapshots) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return &snapshots[0], nil
|
||||
}
|
||||
|
||||
// Stats returns repository statistics.
|
||||
func (r *ResticManager) Stats() (*RepoStats, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
stats := &RepoStats{}
|
||||
|
||||
// Get repo size
|
||||
cmd := r.command(ctx, "stats", "--json")
|
||||
out, err := cmd.Output()
|
||||
if err == nil {
|
||||
var raw struct {
|
||||
TotalSize uint64 `json:"total_size"`
|
||||
}
|
||||
if json.Unmarshal(out, &raw) == nil {
|
||||
stats.TotalSize = formatBytes(int64(raw.TotalSize))
|
||||
}
|
||||
}
|
||||
|
||||
// Count snapshots
|
||||
cmd = r.command(ctx, "snapshots", "--json")
|
||||
out, err = cmd.Output()
|
||||
if err == nil {
|
||||
var snapshots []SnapshotInfo
|
||||
if json.Unmarshal(out, &snapshots) == nil {
|
||||
stats.SnapshotCount = len(snapshots)
|
||||
if len(snapshots) > 0 {
|
||||
latest := snapshots[len(snapshots)-1]
|
||||
stats.LatestSnapshot = &latest
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (r *ResticManager) command(ctx context.Context, args ...string) *exec.Cmd {
|
||||
cmd := exec.CommandContext(ctx, "restic", args...)
|
||||
cmd.Env = append(os.Environ(),
|
||||
"RESTIC_REPOSITORY="+r.repoPath,
|
||||
"RESTIC_PASSWORD_FILE="+r.passwordFile,
|
||||
"RESTIC_CACHE_DIR="+r.cacheDir,
|
||||
)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func (r *ResticManager) generatePassword() error {
|
||||
// Ensure directory exists
|
||||
dir := filepath.Dir(r.passwordFile)
|
||||
if err := os.MkdirAll(dir, 0700); err != nil {
|
||||
return fmt.Errorf("creating password dir: %w", err)
|
||||
}
|
||||
|
||||
// Generate 32 random bytes, base64url-encode
|
||||
b := make([]byte, 32)
|
||||
if _, err := rand.Read(b); err != nil {
|
||||
return fmt.Errorf("generating random bytes: %w", err)
|
||||
}
|
||||
password := base64.URLEncoding.EncodeToString(b)
|
||||
|
||||
if err := os.WriteFile(r.passwordFile, []byte(password), 0600); err != nil {
|
||||
return fmt.Errorf("writing password file: %w", err)
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Generated new restic repository password at %s", r.passwordFile)
|
||||
r.logger.Printf("[WARN] Save this password externally — losing it means losing access to ALL backups")
|
||||
return nil
|
||||
}
|
||||
|
||||
// truncate shortens s to at most maxLen bytes and appends "..." when anything
// was cut off. Strings that already fit are returned unchanged.
//
// The cut never splits a multi-byte UTF-8 sequence: it is moved back to the
// preceding rune boundary, so the result may be a few bytes shorter than
// maxLen. A negative maxLen is treated as 0.
func truncate(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// A byte of the form 10xxxxxx is a UTF-8 continuation byte; if the cut
	// lands on one, step back until it sits on the start of a rune.
	cut := maxLen
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + "..."
}
|
||||
Reference in New Issue
Block a user