8e61cd7ec4
Add structured operational logging at INFO, WARN, and ERROR levels to every controller module. Standardize custom prefixes ([GEO], [SCHED], [SYNC]) to use [INFO/WARN/ERROR] [module] format. Fix misleveled logs (WARN->ERROR for data loss scenarios, WARN->INFO for routine operations). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
369 lines
11 KiB
Go
369 lines
11 KiB
Go
package backup
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Schema and retention limits for local infra backups.
const (
	// MaxSchemaVersion is the newest infra backup schema version this
	// controller is able to read.
	MaxSchemaVersion = 1

	// maxLocalHistory is how many previous backup versions are retained
	// per drive.
	maxLocalHistory = 5
)
|
|
|
|
// InfraMetadata is the small descriptor stored as metadata.json next to
// backup.json. It identifies the backup and carries the checksum used to
// verify the payload when it is read back.
type InfraMetadata struct {
	// SchemaVersion is the backup schema version; readers reject values
	// above MaxSchemaVersion.
	SchemaVersion int `json:"schema_version"`
	// Timestamp is the backup time as supplied by the writer. History
	// rotation parses it as RFC 3339.
	Timestamp string `json:"timestamp"`
	// CustomerID identifies the customer the backup belongs to.
	CustomerID string `json:"customer_id"`
	// ControllerVersion is the controller version that wrote the backup.
	ControllerVersion string `json:"controller_version"`
	// Checksum is the hex-encoded SHA-256 of backup.json.
	Checksum string `json:"checksum"`
}
|
|
|
|
// WriteLocalInfraBackup writes the infra backup to .felhom-infra-backup/ on each drive.
|
|
// Individual drive failures are logged but not returned — the function is best-effort.
|
|
func WriteLocalInfraBackup(backupJSON []byte, customerID, controllerVersion, timestamp string, drives []string, logger *log.Logger, debug bool) {
|
|
if len(drives) == 0 {
|
|
logger.Printf("[DEBUG] No drives configured for local infra backup")
|
|
return
|
|
}
|
|
|
|
if debug {
|
|
logger.Printf("[DEBUG] WriteLocalInfraBackup: payload size=%d bytes, %d target drive(s): %v", len(backupJSON), len(drives), drives)
|
|
}
|
|
|
|
// Compute checksum of backup data
|
|
hash := sha256.Sum256(backupJSON)
|
|
checksum := hex.EncodeToString(hash[:])
|
|
|
|
meta := InfraMetadata{
|
|
SchemaVersion: 1,
|
|
Timestamp: timestamp,
|
|
CustomerID: customerID,
|
|
ControllerVersion: controllerVersion,
|
|
Checksum: checksum,
|
|
}
|
|
|
|
metaJSON, err := json.MarshalIndent(meta, "", " ")
|
|
if err != nil {
|
|
logger.Printf("[ERROR] Local infra backup: failed to marshal metadata: %v", err)
|
|
return
|
|
}
|
|
|
|
written := 0
|
|
for _, drive := range drives {
|
|
dir := InfraBackupDir(drive)
|
|
if debug {
|
|
logger.Printf("[DEBUG] WriteLocalInfraBackup: writing to drive=%s, dir=%s", drive, dir)
|
|
}
|
|
if err := writeInfraToDir(dir, backupJSON, metaJSON, logger); err != nil {
|
|
logger.Printf("[WARN] Local infra backup: failed to write to %s: %v", drive, err)
|
|
continue
|
|
}
|
|
if debug {
|
|
logger.Printf("[DEBUG] WriteLocalInfraBackup: write OK to %s", drive)
|
|
}
|
|
written++
|
|
}
|
|
|
|
logger.Printf("[INFO] Local infra backup written to %d/%d drive(s)", written, len(drives))
|
|
}
|
|
|
|
// writeInfraToDir rotates the current backup into history/ then writes new backup.json and metadata.json.
|
|
func writeInfraToDir(dir string, backupData, metaData []byte, logger *log.Logger) error {
|
|
if err := os.MkdirAll(dir, 0700); err != nil {
|
|
return fmt.Errorf("creating dir: %w", err)
|
|
}
|
|
|
|
// Rotate current backup to history (best-effort)
|
|
rotateToHistory(dir, logger)
|
|
|
|
// Write backup.json atomically
|
|
backupPath := filepath.Join(dir, "backup.json")
|
|
if err := atomicWrite(backupPath, backupData, 0600); err != nil {
|
|
return fmt.Errorf("writing backup.json: %w", err)
|
|
}
|
|
|
|
// Write metadata.json atomically
|
|
metaPath := filepath.Join(dir, "metadata.json")
|
|
if err := atomicWrite(metaPath, metaData, 0600); err != nil {
|
|
return fmt.Errorf("writing metadata.json: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// rotateToHistory moves the current backup.json + metadata.json into history/{timestamp}-*.
|
|
func rotateToHistory(dir string, logger *log.Logger) {
|
|
metaPath := filepath.Join(dir, "metadata.json")
|
|
backupPath := filepath.Join(dir, "backup.json")
|
|
|
|
// Read current metadata to get timestamp
|
|
metaData, err := os.ReadFile(metaPath)
|
|
if err != nil {
|
|
return // no existing backup to rotate
|
|
}
|
|
|
|
var meta InfraMetadata
|
|
if err := json.Unmarshal(metaData, &meta); err != nil {
|
|
return
|
|
}
|
|
|
|
// Parse timestamp, fall back to file mtime
|
|
ts := sanitizeTimestamp(meta.Timestamp)
|
|
if ts == "" {
|
|
if fi, err := os.Stat(metaPath); err == nil {
|
|
ts = fi.ModTime().UTC().Format("20060102T150405Z")
|
|
} else {
|
|
ts = time.Now().UTC().Format("20060102T150405Z")
|
|
}
|
|
}
|
|
|
|
histDir := filepath.Join(dir, "history")
|
|
if err := os.MkdirAll(histDir, 0700); err != nil {
|
|
if logger != nil {
|
|
logger.Printf("[WARN] Local infra history: cannot create history dir: %v", err)
|
|
}
|
|
return
|
|
}
|
|
|
|
// Move files
|
|
histBackup := filepath.Join(histDir, ts+"-backup.json")
|
|
histMeta := filepath.Join(histDir, ts+"-metadata.json")
|
|
|
|
// Copy rather than rename to avoid cross-device issues
|
|
if data, err := os.ReadFile(backupPath); err == nil {
|
|
os.WriteFile(histBackup, data, 0600) //nolint:errcheck
|
|
}
|
|
os.WriteFile(histMeta, metaData, 0600) //nolint:errcheck
|
|
|
|
// Prune old history entries
|
|
pruneLocalHistory(histDir, maxLocalHistory, logger)
|
|
}
|
|
|
|
// pruneLocalHistory trims histDir so that at most maxKeep backup versions
// remain, deleting the oldest ones first.
//
// A version is identified by the prefix of a "{prefix}-metadata.json" file;
// its "{prefix}-backup.json" companion is deleted with it. Prefixes are
// ordered lexicographically, which matches chronological order for the
// fixed-width UTC stamps used by the rotation code. A negative maxKeep is
// treated as 0. A histDir that cannot be listed is silently left alone.
//
// logger may be nil. Otherwise each pruned version is logged, a failed removal
// is logged at WARN level (and that version is not counted as removed), and a
// summary line is written when at least one version was removed.
func pruneLocalHistory(histDir string, maxKeep int, logger *log.Logger) {
	if maxKeep < 0 {
		maxKeep = 0 // a negative limit would index past the end of the slice below
	}

	entries, err := os.ReadDir(histDir)
	if err != nil {
		return
	}

	// Collect version prefixes; directory entry names are already unique.
	const metaSuffix = "-metadata.json"
	stamps := make([]string, 0, len(entries))
	for _, e := range entries {
		if name := e.Name(); strings.HasSuffix(name, metaSuffix) {
			stamps = append(stamps, strings.TrimSuffix(name, metaSuffix))
		}
	}
	if len(stamps) <= maxKeep {
		return
	}

	// Oldest first, so the head of the slice is what gets deleted.
	sort.Strings(stamps)

	removed := 0
	for _, ts := range stamps[:len(stamps)-maxKeep] {
		ok := true
		for _, suffix := range []string{"-backup.json", metaSuffix} {
			p := filepath.Join(histDir, ts+suffix)
			// A missing companion file is fine; anything else is a real failure.
			if err := os.Remove(p); err != nil && !os.IsNotExist(err) {
				ok = false
				if logger != nil {
					logger.Printf("[WARN] Local infra history: cannot remove %s: %v", p, err)
				}
			}
		}
		if !ok {
			continue
		}
		removed++
		if logger != nil {
			logger.Printf("[DEBUG] Local infra history: pruned old version %s", ts)
		}
	}

	if logger != nil && removed > 0 {
		logger.Printf("[INFO] [backup] Pruning old backup versions: kept %d, removed %d", len(stamps)-removed, removed)
	}
}
|
|
|
|
// sanitizeTimestamp turns an RFC 3339 timestamp, with or without fractional
// seconds, into the compact UTC form YYYYMMDDTHHMMSSZ, which is safe to use in
// a file name. It returns "" when ts cannot be parsed.
func sanitizeTimestamp(ts string) string {
	for _, layout := range [...]string{time.RFC3339, time.RFC3339Nano} {
		if parsed, err := time.Parse(layout, ts); err == nil {
			return parsed.UTC().Format("20060102T150405Z")
		}
	}
	return ""
}
|
|
|
|
// atomicWrite replaces the file at path with data without ever exposing a
// partially written file under that name.
//
// The data is written to path+".tmp" (created with perm), flushed to stable
// storage with Sync, and only then renamed over path. Without the Sync, a crash
// or power loss shortly after the rename could leave a truncated or empty file
// under the final name. On any failure the temporary file is removed and the
// error of the failing step is returned; an existing file at path is left
// untouched in that case.
func atomicWrite(path string, data []byte, perm os.FileMode) error {
	tmp := path + ".tmp"

	f, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm)
	if err != nil {
		os.Remove(tmp) // clear a stale temp file that could not be reopened
		return err
	}

	_, err = f.Write(data)
	if err == nil {
		// Make sure the bytes are on disk before the rename publishes them.
		err = f.Sync()
	}
	// Always close; report the close error only if nothing failed earlier.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err == nil {
		err = os.Rename(tmp, path)
	}
	if err != nil {
		os.Remove(tmp)
		return err
	}
	return nil
}
|
|
|
|
// ReadLocalInfraBackup reads and validates an infra backup from a mount point.
|
|
// Returns the raw backup JSON, metadata, and any error.
|
|
func ReadLocalInfraBackup(mountPath string) ([]byte, *InfraMetadata, error) {
|
|
dir := InfraBackupDir(mountPath)
|
|
return readInfraBackupFromDir(dir)
|
|
}
|
|
|
|
// ReadLocalInfraBackupFromHistory reads a specific historical version by its timestamp prefix.
|
|
func ReadLocalInfraBackupFromHistory(mountPath, historyPrefix string) ([]byte, *InfraMetadata, error) {
|
|
histDir := InfraBackupHistoryDir(mountPath)
|
|
|
|
metaPath := filepath.Join(histDir, historyPrefix+"-metadata.json")
|
|
backupPath := filepath.Join(histDir, historyPrefix+"-backup.json")
|
|
|
|
return readInfraBackupFromFiles(backupPath, metaPath)
|
|
}
|
|
|
|
// LocalBackupVersion summarizes one backup version found on a drive, for
// display purposes.
type LocalBackupVersion struct {
	// Timestamp, CustomerID and ControllerVersion are copied from the
	// version's metadata when it could be parsed.
	Timestamp         string `json:"timestamp"`
	CustomerID        string `json:"customer_id"`
	ControllerVersion string `json:"controller_version"`
	// IntegrityOK reports whether the version was read and validated
	// without error.
	IntegrityOK bool `json:"integrity_ok"`
	// Error holds the read or validation error text when IntegrityOK is false.
	Error string `json:"error,omitempty"`
	// StackCount and StackNames describe the deployed stacks listed in the
	// backup.
	StackCount int      `json:"stack_count"`
	StackNames []string `json:"stack_names,omitempty"`
	// DiskCount is the number of mounts in the backup's disk layout.
	DiskCount int `json:"disk_count"`
	// HistoryFile is empty for the current backup and holds the timestamp
	// prefix for a version from history.
	HistoryFile string `json:"history_file,omitempty"`
}
|
|
|
|
// ReadLocalInfraHistory reads all historical backup versions from a mount point's history/ directory.
|
|
// Returns newest-first. Does NOT include the current backup (use ReadLocalInfraBackup for that).
|
|
func ReadLocalInfraHistory(mountPath string) []LocalBackupVersion {
|
|
histDir := InfraBackupHistoryDir(mountPath)
|
|
entries, err := os.ReadDir(histDir)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
// Collect unique timestamps
|
|
var timestamps []string
|
|
seen := make(map[string]bool)
|
|
for _, e := range entries {
|
|
name := e.Name()
|
|
if strings.HasSuffix(name, "-metadata.json") {
|
|
ts := strings.TrimSuffix(name, "-metadata.json")
|
|
if !seen[ts] {
|
|
seen[ts] = true
|
|
timestamps = append(timestamps, ts)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort descending (newest first)
|
|
sort.Sort(sort.Reverse(sort.StringSlice(timestamps)))
|
|
|
|
var versions []LocalBackupVersion
|
|
for _, ts := range timestamps {
|
|
v := LocalBackupVersion{HistoryFile: ts}
|
|
|
|
backupPath := filepath.Join(histDir, ts+"-backup.json")
|
|
metaPath := filepath.Join(histDir, ts+"-metadata.json")
|
|
|
|
backupData, meta, err := readInfraBackupFromFiles(backupPath, metaPath)
|
|
if meta != nil {
|
|
v.Timestamp = meta.Timestamp
|
|
v.CustomerID = meta.CustomerID
|
|
v.ControllerVersion = meta.ControllerVersion
|
|
}
|
|
if err != nil {
|
|
v.IntegrityOK = false
|
|
v.Error = err.Error()
|
|
} else {
|
|
v.IntegrityOK = true
|
|
ParseBackupCounts(backupData, &v.StackCount, &v.StackNames, &v.DiskCount)
|
|
}
|
|
|
|
versions = append(versions, v)
|
|
}
|
|
|
|
return versions
|
|
}
|
|
|
|
// ParseBackupCounts extracts summary figures from backup JSON for display
// purposes.
//
// On success *stackCount receives the number of entries in "deployed_stacks",
// *diskCount the number of entries in "disk_layout.mounts", and one name per
// stack is appended to *stackNames (the stack's display_name, or its name when
// display_name is empty). Names are appended, not replaced, so callers should
// pass an empty slice. Any of the three output pointers may be nil, in which
// case that value is simply not reported. When backupJSON cannot be parsed the
// outputs are left untouched.
func ParseBackupCounts(backupJSON []byte, stackCount *int, stackNames *[]string, diskCount *int) {
	var parsed struct {
		DeployedStacks []struct {
			Name        string `json:"name"`
			DisplayName string `json:"display_name"`
		} `json:"deployed_stacks"`
		DiskLayout struct {
			Mounts []json.RawMessage `json:"mounts"`
		} `json:"disk_layout"`
	}
	if err := json.Unmarshal(backupJSON, &parsed); err != nil {
		return
	}

	if stackCount != nil {
		*stackCount = len(parsed.DeployedStacks)
	}
	if diskCount != nil {
		*diskCount = len(parsed.DiskLayout.Mounts)
	}
	if stackNames == nil {
		return
	}
	for _, s := range parsed.DeployedStacks {
		name := s.DisplayName
		if name == "" {
			name = s.Name
		}
		*stackNames = append(*stackNames, name)
	}
}
|
|
|
|
func readInfraBackupFromDir(dir string) ([]byte, *InfraMetadata, error) {
|
|
metaPath := filepath.Join(dir, "metadata.json")
|
|
backupPath := filepath.Join(dir, "backup.json")
|
|
return readInfraBackupFromFiles(backupPath, metaPath)
|
|
}
|
|
|
|
func readInfraBackupFromFiles(backupPath, metaPath string) ([]byte, *InfraMetadata, error) {
|
|
// Read metadata
|
|
metaData, err := os.ReadFile(metaPath)
|
|
if err != nil {
|
|
return nil, nil, fmt.Errorf("reading metadata.json: %w", err)
|
|
}
|
|
|
|
var meta InfraMetadata
|
|
if err := json.Unmarshal(metaData, &meta); err != nil {
|
|
return nil, nil, fmt.Errorf("parsing metadata.json: %w", err)
|
|
}
|
|
|
|
// Check schema version
|
|
if meta.SchemaVersion > MaxSchemaVersion {
|
|
return nil, &meta, fmt.Errorf("backup schema version %d is newer than supported version %d — upgrade the controller", meta.SchemaVersion, MaxSchemaVersion)
|
|
}
|
|
|
|
// Read backup data
|
|
backupData, err := os.ReadFile(backupPath)
|
|
if err != nil {
|
|
return nil, &meta, fmt.Errorf("reading backup.json: %w", err)
|
|
}
|
|
|
|
// Verify checksum
|
|
hash := sha256.Sum256(backupData)
|
|
actual := hex.EncodeToString(hash[:])
|
|
if actual != meta.Checksum {
|
|
return nil, &meta, fmt.Errorf("checksum mismatch: expected %s, got %s", meta.Checksum, actual)
|
|
}
|
|
|
|
return backupData, &meta, nil
|
|
}
|