v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups
Phase 2 (Monitoring & Health): - Central job scheduler replacing ad-hoc goroutines (internal/scheduler) - CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go) - Temperature reading from /sys/class/thermal + /host/sys (Docker mount) - Load average from /proc/loadavg - Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go) - System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go) Phase 3 (Backups): - Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go) - Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes - Restic backup integration with auto-password generation (internal/backup/restic.go) - Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go) - Manual backup trigger via dashboard button and POST /api/backup/run Dashboard UI: - CPU usage bar with load average display - Temperature with colored indicator dot - Backup status card with last run time, DB count, repo stats - "Mentés most" button for manual backup trigger Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,263 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||||
)
|
||||
|
||||
// Manager orchestrates database dumps and restic backups.
|
||||
type Manager struct {
|
||||
cfg *config.Config
|
||||
restic *ResticManager
|
||||
logger *log.Logger
|
||||
pinger *monitor.Pinger
|
||||
|
||||
mu sync.Mutex
|
||||
lastDBDump *DBDumpStatus
|
||||
lastBackup *BackupStatus
|
||||
running bool
|
||||
}
|
||||
|
||||
// DBDumpStatus holds the last DB dump result.
|
||||
type DBDumpStatus struct {
|
||||
LastRun time.Time
|
||||
Results []DumpResult
|
||||
Success bool
|
||||
Duration time.Duration
|
||||
}
|
||||
|
||||
// BackupStatus holds the last backup result.
|
||||
type BackupStatus struct {
|
||||
LastRun time.Time
|
||||
Snapshot *SnapshotResult
|
||||
Success bool
|
||||
Duration time.Duration
|
||||
RepoStats *RepoStats
|
||||
}
|
||||
|
||||
// NewManager creates a new backup manager.
|
||||
func NewManager(cfg *config.Config, pinger *monitor.Pinger, logger *log.Logger) *Manager {
|
||||
return &Manager{
|
||||
cfg: cfg,
|
||||
restic: NewResticManager(cfg, logger),
|
||||
logger: logger,
|
||||
pinger: pinger,
|
||||
}
|
||||
}
|
||||
|
||||
// RunDBDumps discovers and dumps all databases.
|
||||
func (m *Manager) RunDBDumps(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
m.logger.Printf("[INFO] Starting database dump run")
|
||||
|
||||
dbs, err := DiscoverDatabases(ctx, m.logger)
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] Database discovery failed: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
if len(dbs) == 0 {
|
||||
m.logger.Printf("[INFO] No database containers found")
|
||||
m.mu.Lock()
|
||||
m.lastDBDump = &DBDumpStatus{
|
||||
LastRun: time.Now(),
|
||||
Success: true,
|
||||
Duration: time.Since(start),
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
m.logger.Printf("[INFO] Discovered %d database(s): %s", len(dbs), dbNames(dbs))
|
||||
|
||||
results := DumpAll(ctx, dbs, m.cfg.Paths.DBDumpDir, m.logger)
|
||||
|
||||
// Check results
|
||||
allOK := true
|
||||
var summary []string
|
||||
var totalSize int64
|
||||
for _, r := range results {
|
||||
if r.Error != nil {
|
||||
allOK = false
|
||||
summary = append(summary, fmt.Sprintf("FAIL %s: %v", r.DB.ContainerName, r.Error))
|
||||
m.logger.Printf("[ERROR] DB dump failed for %s: %v", r.DB.ContainerName, r.Error)
|
||||
} else {
|
||||
totalSize += r.Size
|
||||
summary = append(summary, fmt.Sprintf("OK %s (%s)", r.DB.ContainerName, formatBytes(r.Size)))
|
||||
}
|
||||
}
|
||||
|
||||
duration := time.Since(start)
|
||||
m.mu.Lock()
|
||||
m.lastDBDump = &DBDumpStatus{
|
||||
LastRun: time.Now(),
|
||||
Results: results,
|
||||
Success: allOK,
|
||||
Duration: duration,
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
// Ping healthcheck
|
||||
uuid := m.cfg.Monitoring.PingUUIDs.DBDump
|
||||
body := fmt.Sprintf("DB dump: %d databases, %s total\n%s",
|
||||
len(results), formatBytes(totalSize), strings.Join(summary, "\n"))
|
||||
|
||||
if allOK {
|
||||
m.pinger.Ping(uuid, body)
|
||||
m.logger.Printf("[INFO] DB dump completed: %d databases, %s total (%s)",
|
||||
len(results), formatBytes(totalSize), duration.Round(time.Millisecond))
|
||||
} else {
|
||||
m.pinger.Fail(uuid, body)
|
||||
return fmt.Errorf("some database dumps failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunBackup runs a restic backup snapshot.
|
||||
func (m *Manager) RunBackup(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
m.logger.Printf("[INFO] Starting restic backup")
|
||||
|
||||
// Ensure repo is initialized
|
||||
if err := m.restic.EnsureInitialized(); err != nil {
|
||||
m.logger.Printf("[ERROR] Restic init failed: %v", err)
|
||||
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Restic init failed: %v", err))
|
||||
return err
|
||||
}
|
||||
|
||||
// Backup paths
|
||||
paths := []string{
|
||||
m.cfg.Paths.StacksDir,
|
||||
m.cfg.Paths.DBDumpDir,
|
||||
"/opt/docker/felhom-controller/controller.yaml",
|
||||
}
|
||||
tags := []string{"felhom", m.cfg.Customer.ID}
|
||||
|
||||
result, err := m.restic.Snapshot(paths, tags)
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] Restic backup failed: %v", err)
|
||||
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Backup failed: %v", err))
|
||||
|
||||
m.mu.Lock()
|
||||
m.lastBackup = &BackupStatus{
|
||||
LastRun: time.Now(),
|
||||
Success: false,
|
||||
Duration: time.Since(start),
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return err
|
||||
}
|
||||
|
||||
// Prune check (weekly — Sunday)
|
||||
if shouldPrune(m.cfg.Backup.PruneSchedule) {
|
||||
m.logger.Printf("[INFO] Running weekly prune")
|
||||
if err := m.restic.Prune(m.cfg.Backup.Retention); err != nil {
|
||||
m.logger.Printf("[WARN] Restic prune failed: %v", err)
|
||||
}
|
||||
if err := m.restic.Check(); err != nil {
|
||||
m.logger.Printf("[WARN] Restic check failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Get stats
|
||||
stats, _ := m.restic.Stats()
|
||||
|
||||
duration := time.Since(start)
|
||||
m.mu.Lock()
|
||||
m.lastBackup = &BackupStatus{
|
||||
LastRun: time.Now(),
|
||||
Snapshot: result,
|
||||
Success: true,
|
||||
Duration: duration,
|
||||
RepoStats: stats,
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
body := fmt.Sprintf("Backup OK\nSnapshot: %s\nNew files: %d, Changed: %d\nData added: %s\nDuration: %s",
|
||||
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
|
||||
duration.Round(time.Second))
|
||||
m.pinger.Ping(m.cfg.Monitoring.PingUUIDs.Backup, body)
|
||||
|
||||
m.logger.Printf("[INFO] Restic backup completed: snapshot %s, %d new, %d changed, %s added (%s)",
|
||||
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
|
||||
duration.Round(time.Millisecond))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunFullBackup runs DB dumps followed by restic backup.
|
||||
func (m *Manager) RunFullBackup(ctx context.Context) error {
|
||||
m.mu.Lock()
|
||||
if m.running {
|
||||
m.mu.Unlock()
|
||||
return fmt.Errorf("backup already in progress")
|
||||
}
|
||||
m.running = true
|
||||
m.mu.Unlock()
|
||||
|
||||
defer func() {
|
||||
m.mu.Lock()
|
||||
m.running = false
|
||||
m.mu.Unlock()
|
||||
}()
|
||||
|
||||
// Step 1: DB dumps
|
||||
if err := m.RunDBDumps(ctx); err != nil {
|
||||
m.logger.Printf("[WARN] DB dump had errors, continuing with backup anyway")
|
||||
}
|
||||
|
||||
// Step 2: Restic backup
|
||||
return m.RunBackup(ctx)
|
||||
}
|
||||
|
||||
// GetStatus returns the current backup status.
|
||||
func (m *Manager) GetStatus() (*DBDumpStatus, *BackupStatus) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return m.lastDBDump, m.lastBackup
|
||||
}
|
||||
|
||||
// GetRepoStats returns repository statistics.
|
||||
func (m *Manager) GetRepoStats() (*RepoStats, error) {
|
||||
return m.restic.Stats()
|
||||
}
|
||||
|
||||
// IsRunning returns whether a backup is currently in progress.
|
||||
func (m *Manager) IsRunning() bool {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return m.running
|
||||
}
|
||||
|
||||
// shouldPrune reports whether the repository should be pruned right now.
// The schedule is matched case-insensitively: "daily" always prunes; "weekly"
// and every other value prune only on Sundays, evaluated in the
// Europe/Budapest time zone (UTC if that zone cannot be loaded).
func shouldPrune(schedule string) bool {
	if strings.ToLower(schedule) == "daily" {
		return true
	}

	tz, err := time.LoadLocation("Europe/Budapest")
	if err != nil {
		tz = time.UTC
	}
	return time.Now().In(tz).Weekday() == time.Sunday
}
|
||||
|
||||
func dbNames(dbs []DiscoveredDB) string {
|
||||
var names []string
|
||||
for _, db := range dbs {
|
||||
names = append(names, fmt.Sprintf("%s(%s)", db.ContainerName, db.DBType))
|
||||
}
|
||||
return strings.Join(names, ", ")
|
||||
}
|
||||
@@ -0,0 +1,324 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// DBType identifies the database engine running inside a container.
type DBType string

// Database engines that can be discovered and dumped.
const (
	DBTypePostgres DBType = "postgres"
	DBTypeMariaDB  DBType = "mariadb" // also assigned to images whose name contains "mysql"
)
|
||||
|
||||
// DiscoveredDB holds metadata about a running database container.
|
||||
type DiscoveredDB struct {
|
||||
ContainerName string
|
||||
ContainerID string
|
||||
DBType DBType
|
||||
DBUser string
|
||||
DBName string
|
||||
StackName string
|
||||
}
|
||||
|
||||
// DumpResult holds the outcome of a single database dump.
|
||||
type DumpResult struct {
|
||||
DB DiscoveredDB
|
||||
FilePath string
|
||||
Size int64
|
||||
Duration time.Duration
|
||||
Error error
|
||||
}
|
||||
|
||||
// DiscoverDatabases finds running database containers via docker ps.
|
||||
func DiscoverDatabases(ctx context.Context, logger *log.Logger) ([]DiscoveredDB, error) {
|
||||
cmd := exec.CommandContext(ctx, "docker", "ps", "--format", "{{.ID}}\t{{.Names}}\t{{.Image}}", "--filter", "status=running")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("docker ps failed: %w", err)
|
||||
}
|
||||
|
||||
var dbs []DiscoveredDB
|
||||
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, "\t", 3)
|
||||
if len(parts) < 3 {
|
||||
continue
|
||||
}
|
||||
|
||||
id, name, image := parts[0], parts[1], strings.ToLower(parts[2])
|
||||
|
||||
var dbType DBType
|
||||
if strings.Contains(image, "postgres") {
|
||||
dbType = DBTypePostgres
|
||||
} else if strings.Contains(image, "mariadb") || strings.Contains(image, "mysql") {
|
||||
dbType = DBTypeMariaDB
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
|
||||
db := DiscoveredDB{
|
||||
ContainerID: id,
|
||||
ContainerName: name,
|
||||
DBType: dbType,
|
||||
StackName: deriveStackName(name),
|
||||
}
|
||||
|
||||
// Get env vars from container
|
||||
if err := populateDBEnv(ctx, &db); err != nil {
|
||||
logger.Printf("[WARN] Could not read env vars for %s: %v", name, err)
|
||||
continue
|
||||
}
|
||||
|
||||
dbs = append(dbs, db)
|
||||
}
|
||||
|
||||
return dbs, nil
|
||||
}
|
||||
|
||||
// DumpAll dumps all discovered databases.
|
||||
func DumpAll(ctx context.Context, dbs []DiscoveredDB, dumpDir string, logger *log.Logger) []DumpResult {
|
||||
// Clean up old .tmp files (older than 1 hour)
|
||||
cleanupTmpFiles(dumpDir, logger)
|
||||
|
||||
var results []DumpResult
|
||||
for _, db := range dbs {
|
||||
result := DumpOne(ctx, db, dumpDir, logger)
|
||||
results = append(results, result)
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
// DumpOne dumps a single database.
|
||||
func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.Logger) DumpResult {
|
||||
start := time.Now()
|
||||
result := DumpResult{DB: db}
|
||||
|
||||
// Ensure dump directory exists
|
||||
if err := os.MkdirAll(dumpDir, 0755); err != nil {
|
||||
result.Error = fmt.Errorf("creating dump dir: %w", err)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
filename := fmt.Sprintf("%s-%s.sql", db.StackName, db.DBType)
|
||||
tmpPath := filepath.Join(dumpDir, filename+".tmp")
|
||||
finalPath := filepath.Join(dumpDir, filename)
|
||||
|
||||
// 5-minute timeout per dump
|
||||
dumpCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
// Verify container is still running
|
||||
checkCmd := exec.CommandContext(dumpCtx, "docker", "inspect", "--format", "{{.State.Running}}", db.ContainerID)
|
||||
checkOut, err := checkCmd.Output()
|
||||
if err != nil || strings.TrimSpace(string(checkOut)) != "true" {
|
||||
result.Error = fmt.Errorf("container %s no longer running", db.ContainerName)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
// Build dump command
|
||||
var cmd *exec.Cmd
|
||||
switch db.DBType {
|
||||
case DBTypePostgres:
|
||||
cmd = exec.CommandContext(dumpCtx, "docker", "exec", db.ContainerID,
|
||||
"pg_dump", "-U", db.DBUser, "-d", db.DBName,
|
||||
"--clean", "--if-exists", "--no-owner", "--no-privileges")
|
||||
case DBTypeMariaDB:
|
||||
// Get root password from container env
|
||||
password := getMariaDBPassword(dumpCtx, db.ContainerID)
|
||||
if password == "" {
|
||||
result.Error = fmt.Errorf("could not determine MariaDB root password for %s", db.ContainerName)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
cmd = exec.CommandContext(dumpCtx, "docker", "exec", db.ContainerID,
|
||||
"mariadb-dump", "-u", "root", "-p"+password,
|
||||
"--single-transaction", "--routines", "--triggers", db.DBName)
|
||||
default:
|
||||
result.Error = fmt.Errorf("unsupported DB type: %s", db.DBType)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
// Write output to tmp file
|
||||
tmpFile, err := os.Create(tmpPath)
|
||||
if err != nil {
|
||||
result.Error = fmt.Errorf("creating tmp file: %w", err)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
cmd.Stdout = tmpFile
|
||||
var stderr strings.Builder
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
err = cmd.Run()
|
||||
tmpFile.Close()
|
||||
|
||||
if err != nil {
|
||||
os.Remove(tmpPath)
|
||||
errMsg := stderr.String()
|
||||
if len(errMsg) > 200 {
|
||||
errMsg = errMsg[:200]
|
||||
}
|
||||
result.Error = fmt.Errorf("dump failed: %v — %s", err, errMsg)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
// Check file size
|
||||
stat, err := os.Stat(tmpPath)
|
||||
if err != nil || stat.Size() == 0 {
|
||||
os.Remove(tmpPath)
|
||||
result.Error = fmt.Errorf("dump produced empty file for %s", db.ContainerName)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
// Rename tmp to final
|
||||
if err := os.Rename(tmpPath, finalPath); err != nil {
|
||||
os.Remove(tmpPath)
|
||||
result.Error = fmt.Errorf("renaming dump file: %w", err)
|
||||
result.Duration = time.Since(start)
|
||||
return result
|
||||
}
|
||||
|
||||
result.FilePath = finalPath
|
||||
result.Size = stat.Size()
|
||||
result.Duration = time.Since(start)
|
||||
|
||||
logger.Printf("[INFO] DB dump: %s → %s (%s, %s)", db.ContainerName, filename,
|
||||
formatBytes(stat.Size()), result.Duration.Round(time.Millisecond))
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func populateDBEnv(ctx context.Context, db *DiscoveredDB) error {
|
||||
cmd := exec.CommandContext(ctx, "docker", "inspect", db.ContainerID,
|
||||
"--format", "{{range .Config.Env}}{{println .}}{{end}}")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
env := make(map[string]string)
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if idx := strings.IndexByte(line, '='); idx > 0 {
|
||||
env[line[:idx]] = line[idx+1:]
|
||||
}
|
||||
}
|
||||
|
||||
switch db.DBType {
|
||||
case DBTypePostgres:
|
||||
db.DBUser = env["POSTGRES_USER"]
|
||||
if db.DBUser == "" {
|
||||
db.DBUser = "postgres"
|
||||
}
|
||||
db.DBName = env["POSTGRES_DB"]
|
||||
if db.DBName == "" {
|
||||
db.DBName = db.DBUser
|
||||
}
|
||||
case DBTypeMariaDB:
|
||||
db.DBName = env["MYSQL_DATABASE"]
|
||||
if db.DBName == "" {
|
||||
db.DBName = env["MARIADB_DATABASE"]
|
||||
}
|
||||
if db.DBName == "" {
|
||||
db.DBName = "mysql" // fallback to dump all
|
||||
}
|
||||
db.DBUser = "root"
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// getMariaDBPassword returns the root password configured in the container's
// environment. It returns the value of the first MYSQL_ROOT_PASSWORD or
// MARIADB_ROOT_PASSWORD entry listed by `docker inspect`, or "" when neither
// is set or the inspect command fails.
func getMariaDBPassword(ctx context.Context, containerID string) string {
	inspect := exec.CommandContext(ctx, "docker", "inspect", containerID,
		"--format", "{{range .Config.Env}}{{println .}}{{end}}")
	out, err := inspect.Output()
	if err != nil {
		return ""
	}

	prefixes := [...]string{"MYSQL_ROOT_PASSWORD=", "MARIADB_ROOT_PASSWORD="}
	for _, entry := range strings.Split(string(out), "\n") {
		for _, p := range prefixes {
			if strings.HasPrefix(entry, p) {
				return entry[len(p):]
			}
		}
	}
	return ""
}
|
||||
|
||||
// deriveStackName removes the last "-"-separated segment of a container name
// when that segment (compared case-insensitively) is a well-known database or
// cache suffix, e.g. "nextcloud-db" becomes "nextcloud". Any other name is
// returned unchanged.
func deriveStackName(containerName string) string {
	cut := strings.LastIndex(containerName, "-")
	if cut < 0 {
		// Single segment: nothing to strip.
		return containerName
	}

	switch strings.ToLower(containerName[cut+1:]) {
	case "postgres", "db", "mariadb", "mysql", "database", "redis", "cache":
		return containerName[:cut]
	}
	return containerName
}
|
||||
|
||||
// cleanupTmpFiles removes entries in dumpDir whose name ends in ".tmp" and
// whose modification time is more than one hour old. It is best-effort: an
// unreadable directory is silently ignored, and a removal that fails is
// logged as a warning instead of being reported as cleaned up.
func cleanupTmpFiles(dumpDir string, logger *log.Logger) {
	entries, err := os.ReadDir(dumpDir)
	if err != nil {
		return
	}

	cutoff := time.Now().Add(-1 * time.Hour)
	for _, e := range entries {
		if !strings.HasSuffix(e.Name(), ".tmp") {
			continue
		}
		info, err := e.Info()
		if err != nil {
			continue
		}
		if !info.ModTime().Before(cutoff) {
			// Recent enough that it may belong to a dump still in progress.
			continue
		}
		if err := os.Remove(filepath.Join(dumpDir, e.Name())); err != nil {
			logger.Printf("[WARN] Could not remove stale tmp file %s: %v", e.Name(), err)
			continue
		}
		logger.Printf("[INFO] Cleaned up stale tmp file: %s", e.Name())
	}
}
|
||||
|
||||
// formatBytes renders a byte count with binary (1024-based) units: values of
// at least 1 KB/MB/GB are shown with one decimal in the largest fitting unit,
// anything smaller (including negative values) as a plain byte count.
func formatBytes(b int64) string {
	units := []struct {
		size  int64
		label string
	}{
		{1 << 30, "GB"},
		{1 << 20, "MB"},
		{1 << 10, "KB"},
	}

	for _, u := range units {
		if b >= u.size {
			return fmt.Sprintf("%.1f %s", float64(b)/float64(u.size), u.label)
		}
	}
	return fmt.Sprintf("%d B", b)
}
|
||||
@@ -0,0 +1,316 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
)
|
||||
|
||||
// ResticManager runs restic commands against one repository.
type ResticManager struct {
	repoPath     string // exported to restic as RESTIC_REPOSITORY
	passwordFile string // exported to restic as RESTIC_PASSWORD_FILE
	logger       *log.Logger
	customerID   string // passed as --host when creating snapshots
	cacheDir     string // exported to restic as RESTIC_CACHE_DIR
}
|
||||
|
||||
// SnapshotResult summarizes a completed `restic backup` run.
type SnapshotResult struct {
	SnapshotID   string        // snapshot_id from restic's summary message
	FilesNew     int           // files_new from restic's summary message
	FilesChanged int           // files_changed from restic's summary message
	DataAdded    string        // data_added, formatted for display
	Duration     time.Duration // wall-clock time of the backup command
}
|
||||
|
||||
// SnapshotInfo is one entry of the JSON list printed by
// `restic snapshots --json`.
type SnapshotInfo struct {
	ID    string    `json:"short_id"`
	Time  time.Time `json:"time"`
	Paths []string  `json:"paths"`
	Tags  []string  `json:"tags"`
}
|
||||
|
||||
// RepoStats holds repository statistics.
|
||||
type RepoStats struct {
|
||||
TotalSize string
|
||||
SnapshotCount int
|
||||
LatestSnapshot *SnapshotInfo
|
||||
}
|
||||
|
||||
// NewResticManager creates a new restic manager.
|
||||
func NewResticManager(cfg *config.Config, logger *log.Logger) *ResticManager {
|
||||
return &ResticManager{
|
||||
repoPath: cfg.Backup.ResticRepo,
|
||||
passwordFile: cfg.Backup.ResticPasswordFile,
|
||||
logger: logger,
|
||||
customerID: cfg.Customer.ID,
|
||||
cacheDir: filepath.Join(cfg.Paths.DataDir, "restic-cache"),
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureInitialized checks if the restic repo exists and initializes it if not.
|
||||
// Also auto-generates the password file if missing.
|
||||
func (r *ResticManager) EnsureInitialized() error {
|
||||
// Ensure password file exists
|
||||
if _, err := os.Stat(r.passwordFile); os.IsNotExist(err) {
|
||||
if err := r.generatePassword(); err != nil {
|
||||
return fmt.Errorf("generating restic password: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure cache dir exists
|
||||
os.MkdirAll(r.cacheDir, 0700)
|
||||
|
||||
// Check if repo is already initialized
|
||||
configPath := filepath.Join(r.repoPath, "config")
|
||||
if _, err := os.Stat(configPath); err == nil {
|
||||
r.logger.Printf("[INFO] Restic repo already initialized at %s", r.repoPath)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Ensure repo directory exists
|
||||
if err := os.MkdirAll(r.repoPath, 0700); err != nil {
|
||||
return fmt.Errorf("creating repo dir: %w", err)
|
||||
}
|
||||
|
||||
// Initialize repo
|
||||
r.logger.Printf("[INFO] Initializing restic repository at %s", r.repoPath)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
cmd := r.command(ctx, "init")
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("restic init failed: %v — %s", err, truncate(string(out), 200))
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Restic repository initialized successfully")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Snapshot creates a new backup snapshot of the given paths.
|
||||
func (r *ResticManager) Snapshot(paths []string, tags []string) (*SnapshotResult, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
start := time.Now()
|
||||
|
||||
args := []string{"backup", "--json"}
|
||||
for _, tag := range tags {
|
||||
args = append(args, "--tag", tag)
|
||||
}
|
||||
args = append(args, "--host", r.customerID)
|
||||
|
||||
// Only include paths that exist
|
||||
var existingPaths []string
|
||||
for _, p := range paths {
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
existingPaths = append(existingPaths, p)
|
||||
} else {
|
||||
r.logger.Printf("[WARN] Backup path does not exist, skipping: %s", p)
|
||||
}
|
||||
}
|
||||
|
||||
if len(existingPaths) == 0 {
|
||||
return nil, fmt.Errorf("no backup paths exist")
|
||||
}
|
||||
args = append(args, existingPaths...)
|
||||
|
||||
cmd := r.command(ctx, args...)
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
// Check for stale lock
|
||||
errStr := string(out)
|
||||
if strings.Contains(errStr, "lock") || strings.Contains(errStr, "locked") {
|
||||
r.logger.Printf("[WARN] Restic repo locked — attempting unlock")
|
||||
unlockCmd := r.command(ctx, "unlock")
|
||||
unlockCmd.Run()
|
||||
// Retry once
|
||||
cmd = r.command(ctx, args...)
|
||||
out, err = cmd.Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("restic backup failed after unlock: %v", err)
|
||||
}
|
||||
} else {
|
||||
return nil, fmt.Errorf("restic backup failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
result := &SnapshotResult{
|
||||
Duration: time.Since(start),
|
||||
}
|
||||
|
||||
// Parse JSON output — look for the summary line
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var msg struct {
|
||||
MessageType string `json:"message_type"`
|
||||
FilesNew int `json:"files_new"`
|
||||
FilesChanged int `json:"files_changed"`
|
||||
DataAdded int64 `json:"data_added"`
|
||||
SnapshotID string `json:"snapshot_id"`
|
||||
}
|
||||
if err := json.Unmarshal([]byte(line), &msg); err != nil {
|
||||
continue
|
||||
}
|
||||
if msg.MessageType == "summary" {
|
||||
result.SnapshotID = msg.SnapshotID
|
||||
result.FilesNew = msg.FilesNew
|
||||
result.FilesChanged = msg.FilesChanged
|
||||
result.DataAdded = formatBytes(msg.DataAdded)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Prune removes old snapshots according to retention policy.
|
||||
func (r *ResticManager) Prune(retention config.RetentionConfig) error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
args := []string{
|
||||
"forget",
|
||||
"--keep-daily", fmt.Sprintf("%d", retention.KeepDaily),
|
||||
"--keep-weekly", fmt.Sprintf("%d", retention.KeepWeekly),
|
||||
"--keep-monthly", fmt.Sprintf("%d", retention.KeepMonthly),
|
||||
"--prune",
|
||||
}
|
||||
|
||||
cmd := r.command(ctx, args...)
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("restic forget/prune failed: %v — %s", err, truncate(string(out), 200))
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Restic prune completed")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Check verifies repository integrity.
|
||||
func (r *ResticManager) Check() error {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
cmd := r.command(ctx, "check")
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("restic check failed: %v — %s", err, truncate(string(out), 200))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// LatestSnapshot returns the most recent snapshot info.
|
||||
func (r *ResticManager) LatestSnapshot() (*SnapshotInfo, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
cmd := r.command(ctx, "snapshots", "--latest", "1", "--json")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("restic snapshots failed: %v", err)
|
||||
}
|
||||
|
||||
var snapshots []SnapshotInfo
|
||||
if err := json.Unmarshal(out, &snapshots); err != nil {
|
||||
return nil, fmt.Errorf("parsing snapshot JSON: %v", err)
|
||||
}
|
||||
|
||||
if len(snapshots) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return &snapshots[0], nil
|
||||
}
|
||||
|
||||
// Stats returns repository statistics.
|
||||
func (r *ResticManager) Stats() (*RepoStats, error) {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
stats := &RepoStats{}
|
||||
|
||||
// Get repo size
|
||||
cmd := r.command(ctx, "stats", "--json")
|
||||
out, err := cmd.Output()
|
||||
if err == nil {
|
||||
var raw struct {
|
||||
TotalSize uint64 `json:"total_size"`
|
||||
}
|
||||
if json.Unmarshal(out, &raw) == nil {
|
||||
stats.TotalSize = formatBytes(int64(raw.TotalSize))
|
||||
}
|
||||
}
|
||||
|
||||
// Count snapshots
|
||||
cmd = r.command(ctx, "snapshots", "--json")
|
||||
out, err = cmd.Output()
|
||||
if err == nil {
|
||||
var snapshots []SnapshotInfo
|
||||
if json.Unmarshal(out, &snapshots) == nil {
|
||||
stats.SnapshotCount = len(snapshots)
|
||||
if len(snapshots) > 0 {
|
||||
latest := snapshots[len(snapshots)-1]
|
||||
stats.LatestSnapshot = &latest
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (r *ResticManager) command(ctx context.Context, args ...string) *exec.Cmd {
|
||||
cmd := exec.CommandContext(ctx, "restic", args...)
|
||||
cmd.Env = append(os.Environ(),
|
||||
"RESTIC_REPOSITORY="+r.repoPath,
|
||||
"RESTIC_PASSWORD_FILE="+r.passwordFile,
|
||||
"RESTIC_CACHE_DIR="+r.cacheDir,
|
||||
)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func (r *ResticManager) generatePassword() error {
|
||||
// Ensure directory exists
|
||||
dir := filepath.Dir(r.passwordFile)
|
||||
if err := os.MkdirAll(dir, 0700); err != nil {
|
||||
return fmt.Errorf("creating password dir: %w", err)
|
||||
}
|
||||
|
||||
// Generate 32 random bytes, base64url-encode
|
||||
b := make([]byte, 32)
|
||||
if _, err := rand.Read(b); err != nil {
|
||||
return fmt.Errorf("generating random bytes: %w", err)
|
||||
}
|
||||
password := base64.URLEncoding.EncodeToString(b)
|
||||
|
||||
if err := os.WriteFile(r.passwordFile, []byte(password), 0600); err != nil {
|
||||
return fmt.Errorf("writing password file: %w", err)
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Generated new restic repository password at %s", r.passwordFile)
|
||||
r.logger.Printf("[WARN] Save this password externally — losing it means losing access to ALL backups")
|
||||
return nil
|
||||
}
|
||||
|
||||
// truncate shortens s to at most maxLen bytes and appends "..." when
// anything was cut off. Strings that already fit are returned unchanged.
//
// The cut never leaves half of a multi-byte UTF-8 character behind: invalid
// byte sequences in the kept prefix are dropped. A negative maxLen is treated
// as zero.
func truncate(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}
	return strings.ToValidUTF8(s[:maxLen], "") + "..."
}
|
||||
Reference in New Issue
Block a user