diff --git a/controller/internal/backup/backup.go b/controller/internal/backup/backup.go index 739ba82..c6f3051 100644 --- a/controller/internal/backup/backup.go +++ b/controller/internal/backup/backup.go @@ -203,20 +203,22 @@ func (m *Manager) groupStacksByDrive() map[string][]StackSummary { } // activeDrives returns sorted list of drives that have deployed apps. +// Disconnected and decommissioned drives are excluded. func (m *Manager) activeDrives() []string { groups := m.groupStacksByDrive() var drives []string - var disconnected []string + var skipped []string for d := range groups { if m.settings != nil && (m.settings.IsDisconnected(d) || m.settings.IsDecommissioned(d)) { - disconnected = append(disconnected, d) + skipped = append(skipped, d) + continue } drives = append(drives, d) } sort.Strings(drives) if m.isDebug() { - m.logger.Printf("[DEBUG] activeDrives: %d total (%s), %d disconnected/decommissioned", - len(drives), strings.Join(drives, ", "), len(disconnected)) + m.logger.Printf("[DEBUG] activeDrives: %d active (%s), %d skipped (disconnected/decommissioned)", + len(drives), strings.Join(drives, ", "), len(skipped)) } return drives } @@ -1211,11 +1213,10 @@ func (m *Manager) GetFullStatus(nextDBDump, nextBackup time.Time) *FullBackupSta } // No cache yet — return a minimal status (first page load before cache is populated) - return &FullBackupStatus{ + // Deep-copy lastDBDump and lastBackup to prevent callers from mutating shared state. + status := &FullBackupStatus{ Enabled: m.cfg.Backup.Enabled, Running: m.running, - LastDBDump: m.lastDBDump, - LastBackup: m.lastBackup, DBDumpSchedule: m.cfg.Backup.DBDumpSchedule, ResticSchedule: m.cfg.Backup.ResticSchedule, PruneSchedule: m.cfg.Backup.PruneSchedule, @@ -1225,6 +1226,19 @@ func (m *Manager) GetFullStatus(nextDBDump, nextBackup time.Time) *FullBackupSta LastCheckTime: m.lastCheckTime, LastCheckOK: m.lastCheckOK, } + if m.lastDBDump != nil { + copyDump := *m.lastDBDump + if len(m.lastDBDump.Results) > 0 { + copyDump.Results = make([]DumpResult, len(m.lastDBDump.Results)) + copy(copyDump.Results, m.lastDBDump.Results) + } + status.LastDBDump = ©Dump + } + if m.lastBackup != nil { + copyBackup := *m.lastBackup + status.LastBackup = ©Backup + } + return status } // isDebug returns true if logging level is "debug". diff --git a/controller/internal/backup/crossdrive.go b/controller/internal/backup/crossdrive.go index 73f6a82..1af7bf5 100644 --- a/controller/internal/backup/crossdrive.go +++ b/controller/internal/backup/crossdrive.go @@ -372,7 +372,8 @@ func (r *CrossDriveRunner) runRsyncBackup(ctx context.Context, stackName, destBa return fmt.Errorf("creating rsync dest dir: %w", err) } - for i, srcMount := range mounts { + seen := make(map[string]bool) + for _, srcMount := range mounts { var dstPath string if len(mounts) == 1 { // Single mount: rsync directly into the stack folder (no extra nesting) @@ -380,13 +381,18 @@ func (r *CrossDriveRunner) runRsyncBackup(ctx context.Context, stackName, destBa } else { // Multiple mounts: use the leaf directory name as subfolder leaf := filepath.Base(srcMount) - dstPath = filepath.Join(destDir, leaf) - // Disambiguate duplicate leaf names (e.g. two mounts both named "data") - if i > 0 { - if _, err := os.Stat(dstPath); err == nil { - dstPath = filepath.Join(destDir, fmt.Sprintf("%s_%d", leaf, i)) + if seen[leaf] { + // Disambiguate duplicate leaf names (e.g. two mounts both named "data") + for j := 2; ; j++ { + candidate := fmt.Sprintf("%s_%d", leaf, j) + if !seen[candidate] { + leaf = candidate + break + } } } + seen[leaf] = true + dstPath = filepath.Join(destDir, leaf) } if err := os.MkdirAll(dstPath, 0755); err != nil { return fmt.Errorf("creating rsync destination: %w", err) diff --git a/controller/internal/backup/restic.go b/controller/internal/backup/restic.go index dd9f405..1885fc7 100644 --- a/controller/internal/backup/restic.go +++ b/controller/internal/backup/restic.go @@ -134,12 +134,17 @@ func (r *ResticManager) Snapshot(repoPath string, paths []string, tags []string) cmd := r.command(ctx, repoPath, args...) out, err := cmd.Output() if err != nil { - // Check for stale lock + // Check for stale lock — restic writes lock errors to stderr, not stdout errStr := string(out) + if exitErr, ok := err.(*exec.ExitError); ok { + errStr += string(exitErr.Stderr) + } if strings.Contains(errStr, "lock") || strings.Contains(errStr, "locked") { r.logger.Printf("[WARN] Restic repo locked — attempting unlock") unlockCmd := r.command(ctx, repoPath, "unlock") - unlockCmd.Run() + if unlockErr := unlockCmd.Run(); unlockErr != nil { + r.logger.Printf("[WARN] Restic unlock failed: %v", unlockErr) + } // Retry once cmd = r.command(ctx, repoPath, args...) out, err = cmd.Output() diff --git a/controller/internal/cloudflare/client.go b/controller/internal/cloudflare/client.go index d9d824b..5d36e8a 100644 --- a/controller/internal/cloudflare/client.go +++ b/controller/internal/cloudflare/client.go @@ -2,6 +2,7 @@ package cloudflare import ( "bytes" + "context" "encoding/json" "fmt" "io" @@ -54,7 +55,7 @@ type apiMessage struct { } // do performs an HTTP request to the Cloudflare API and decodes the response. -func (c *Client) do(method, path string, body interface{}) (*apiResponse, error) { +func (c *Client) do(ctx context.Context, method, path string, body interface{}) (*apiResponse, error) { var bodyReader io.Reader if body != nil { data, err := json.Marshal(body) @@ -70,7 +71,7 @@ func (c *Client) do(method, path string, body interface{}) (*apiResponse, error) } url := apiBase + path - req, err := http.NewRequest(method, url, bodyReader) + req, err := http.NewRequestWithContext(ctx, method, url, bodyReader) if err != nil { return nil, fmt.Errorf("create request: %w", err) } diff --git a/controller/internal/cloudflare/geosync.go b/controller/internal/cloudflare/geosync.go index 6af4770..006dcb5 100644 --- a/controller/internal/cloudflare/geosync.go +++ b/controller/internal/cloudflare/geosync.go @@ -76,7 +76,7 @@ func (g *GeoSyncManager) Sync(ctx context.Context) error { zoneID := geo.ZoneID if zoneID == "" { var err error - zoneID, err = g.client.GetZoneID(g.domain) + zoneID, err = g.client.GetZoneID(ctx, g.domain) if err != nil { g.saveError(zoneID, "", err.Error()) return fmt.Errorf("resolve zone: %w", err) @@ -87,13 +87,13 @@ func (g *GeoSyncManager) Sync(ctx context.Context) error { rulesetID := geo.RulesetID if rulesetID == "" { var err error - rulesetID, err = g.client.GetCustomRulesetID(zoneID) + rulesetID, err = g.client.GetCustomRulesetID(ctx, zoneID) if err != nil { g.saveError(zoneID, "", err.Error()) return fmt.Errorf("get ruleset: %w", err) } if rulesetID == "" { - rulesetID, err = g.client.CreateCustomRuleset(zoneID) + rulesetID, err = g.client.CreateCustomRuleset(ctx, zoneID) if err != nil { g.saveError(zoneID, "", err.Error()) return fmt.Errorf("create ruleset: %w", err) @@ -102,7 +102,7 @@ func (g *GeoSyncManager) Sync(ctx context.Context) error { } // 3. List existing felhom-managed rules - existing, err := g.client.GetFelhomRules(zoneID, rulesetID) + existing, err := g.client.GetFelhomRules(ctx, zoneID, rulesetID) if err != nil { g.saveError(zoneID, rulesetID, err.Error()) return fmt.Errorf("list existing rules: %w", err) @@ -112,7 +112,7 @@ func (g *GeoSyncManager) Sync(ctx context.Context) error { desired := g.buildDesiredRules(geo) // 5. Diff and apply - if err := g.applyDiff(zoneID, rulesetID, existing, desired); err != nil { + if err := g.applyDiff(ctx, zoneID, rulesetID, existing, desired); err != nil { g.saveError(zoneID, rulesetID, err.Error()) return fmt.Errorf("apply diff: %w", err) } @@ -138,14 +138,14 @@ func (g *GeoSyncManager) deleteAllRules(ctx context.Context, geo *settings.GeoRe return nil } - existing, err := g.client.GetFelhomRules(zoneID, rulesetID) + existing, err := g.client.GetFelhomRules(ctx, zoneID, rulesetID) if err != nil { g.logger.Printf("[GEO] Warning: could not list rules for cleanup: %v", err) return nil } for _, r := range existing { - if err := g.client.DeleteRule(zoneID, rulesetID, r.ID); err != nil { + if err := g.client.DeleteRule(ctx, zoneID, rulesetID, r.ID); err != nil { g.logger.Printf("[GEO] Warning: could not delete rule %s: %v", r.ID, err) } } @@ -202,7 +202,7 @@ func (g *GeoSyncManager) buildDesiredRules(geo *settings.GeoRestriction) []desir } // applyDiff applies the difference between existing and desired rules. -func (g *GeoSyncManager) applyDiff(zoneID, rulesetID string, existing []GeoRule, desired []desiredRule) error { +func (g *GeoSyncManager) applyDiff(ctx context.Context, zoneID, rulesetID string, existing []GeoRule, desired []desiredRule) error { // Index existing by description existingByDesc := make(map[string]GeoRule) for _, r := range existing { @@ -221,14 +221,14 @@ func (g *GeoSyncManager) applyDiff(zoneID, rulesetID string, existing []GeoRule, // Rule exists — check if expression changed if ex.Expression != d.expression { r := newBlockRule(d.description, d.expression) - if err := g.client.UpdateRule(zoneID, rulesetID, ex.ID, r); err != nil { + if err := g.client.UpdateRule(ctx, zoneID, rulesetID, ex.ID, r); err != nil { return fmt.Errorf("update rule %q: %w", d.description, err) } } } else { // New rule — create r := newBlockRule(d.description, d.expression) - if _, err := g.client.CreateRule(zoneID, rulesetID, r); err != nil { + if _, err := g.client.CreateRule(ctx, zoneID, rulesetID, r); err != nil { return fmt.Errorf("create rule %q: %w", d.description, err) } } @@ -237,7 +237,7 @@ func (g *GeoSyncManager) applyDiff(zoneID, rulesetID string, existing []GeoRule, // Delete rules that are no longer desired for _, ex := range existing { if _, ok := desiredByDesc[ex.Description]; !ok { - if err := g.client.DeleteRule(zoneID, rulesetID, ex.ID); err != nil { + if err := g.client.DeleteRule(ctx, zoneID, rulesetID, ex.ID); err != nil { return fmt.Errorf("delete rule %q: %w", ex.Description, err) } } diff --git a/controller/internal/cloudflare/waf.go b/controller/internal/cloudflare/waf.go index b73029b..d2fcc06 100644 --- a/controller/internal/cloudflare/waf.go +++ b/controller/internal/cloudflare/waf.go @@ -1,6 +1,7 @@ package cloudflare import ( + "context" "encoding/json" "fmt" "strings" @@ -58,9 +59,9 @@ type GeoRule struct { // GetCustomRulesetID returns the zone's http_request_firewall_custom ruleset ID. // Returns empty string if no such ruleset exists yet. -func (c *Client) GetCustomRulesetID(zoneID string) (string, error) { +func (c *Client) GetCustomRulesetID(ctx context.Context, zoneID string) (string, error) { path := fmt.Sprintf("/zones/%s/rulesets", zoneID) - resp, err := c.do("GET", path, nil) + resp, err := c.do(ctx, "GET", path, nil) if err != nil { return "", fmt.Errorf("list rulesets: %w", err) } @@ -80,7 +81,7 @@ func (c *Client) GetCustomRulesetID(zoneID string) (string, error) { } // CreateCustomRuleset creates the http_request_firewall_custom phase entry point ruleset. -func (c *Client) CreateCustomRuleset(zoneID string) (string, error) { +func (c *Client) CreateCustomRuleset(ctx context.Context, zoneID string) (string, error) { path := fmt.Sprintf("/zones/%s/rulesets", zoneID) body := map[string]interface{}{ "name": "felhom custom rules", @@ -89,7 +90,7 @@ func (c *Client) CreateCustomRuleset(zoneID string) (string, error) { "rules": []interface{}{}, } - resp, err := c.do("POST", path, body) + resp, err := c.do(ctx, "POST", path, body) if err != nil { return "", fmt.Errorf("create ruleset: %w", err) } @@ -104,9 +105,9 @@ func (c *Client) CreateCustomRuleset(zoneID string) (string, error) { } // GetRules returns all rules in a ruleset. -func (c *Client) GetRules(zoneID, rulesetID string) ([]rule, error) { +func (c *Client) GetRules(ctx context.Context, zoneID, rulesetID string) ([]rule, error) { path := fmt.Sprintf("/zones/%s/rulesets/%s", zoneID, rulesetID) - resp, err := c.do("GET", path, nil) + resp, err := c.do(ctx, "GET", path, nil) if err != nil { return nil, fmt.Errorf("get ruleset: %w", err) } @@ -122,8 +123,8 @@ func (c *Client) GetRules(zoneID, rulesetID string) ([]rule, error) { } // GetFelhomRules returns only rules with the [felhom-geo] prefix. -func (c *Client) GetFelhomRules(zoneID, rulesetID string) ([]GeoRule, error) { - rules, err := c.GetRules(zoneID, rulesetID) +func (c *Client) GetFelhomRules(ctx context.Context, zoneID, rulesetID string) ([]GeoRule, error) { + rules, err := c.GetRules(ctx, zoneID, rulesetID) if err != nil { return nil, err } @@ -144,9 +145,9 @@ func (c *Client) GetFelhomRules(zoneID, rulesetID string) ([]GeoRule, error) { } // CreateRule adds a new rule to the ruleset. -func (c *Client) CreateRule(zoneID, rulesetID string, r rule) (string, error) { +func (c *Client) CreateRule(ctx context.Context, zoneID, rulesetID string, r rule) (string, error) { path := fmt.Sprintf("/zones/%s/rulesets/%s/rules", zoneID, rulesetID) - resp, err := c.do("POST", path, r) + resp, err := c.do(ctx, "POST", path, r) if err != nil { return "", fmt.Errorf("create rule: %w", err) } @@ -170,9 +171,9 @@ func (c *Client) CreateRule(zoneID, rulesetID string, r rule) (string, error) { } // UpdateRule updates an existing rule in the ruleset. -func (c *Client) UpdateRule(zoneID, rulesetID, ruleID string, r rule) error { +func (c *Client) UpdateRule(ctx context.Context, zoneID, rulesetID, ruleID string, r rule) error { path := fmt.Sprintf("/zones/%s/rulesets/%s/rules/%s", zoneID, rulesetID, ruleID) - _, err := c.do("PATCH", path, r) + _, err := c.do(ctx, "PATCH", path, r) if err != nil { return fmt.Errorf("update rule %s: %w", ruleID, err) } @@ -181,9 +182,9 @@ func (c *Client) UpdateRule(zoneID, rulesetID, ruleID string, r rule) error { } // DeleteRule removes a rule from the ruleset. -func (c *Client) DeleteRule(zoneID, rulesetID, ruleID string) error { +func (c *Client) DeleteRule(ctx context.Context, zoneID, rulesetID, ruleID string) error { path := fmt.Sprintf("/zones/%s/rulesets/%s/rules/%s", zoneID, rulesetID, ruleID) - _, err := c.do("DELETE", path, nil) + _, err := c.do(ctx, "DELETE", path, nil) if err != nil { return fmt.Errorf("delete rule %s: %w", ruleID, err) } diff --git a/controller/internal/cloudflare/zone.go b/controller/internal/cloudflare/zone.go index 1e0270e..d3dced0 100644 --- a/controller/internal/cloudflare/zone.go +++ b/controller/internal/cloudflare/zone.go @@ -1,6 +1,7 @@ package cloudflare import ( + "context" "encoding/json" "fmt" "net/url" @@ -14,9 +15,9 @@ type zone struct { // GetZoneID resolves the Cloudflare zone ID for a domain. // It tries the exact domain first, then strips subdomains progressively. -func (c *Client) GetZoneID(domain string) (string, error) { +func (c *Client) GetZoneID(ctx context.Context, domain string) (string, error) { // Try exact domain first (e.g., "demo-felhom.eu") - id, err := c.lookupZone(domain) + id, err := c.lookupZone(ctx, domain) if err != nil { return "", err } @@ -31,7 +32,7 @@ func (c *Client) GetZoneID(domain string) (string, error) { if parent == "" { break } - id, err = c.lookupZone(parent) + id, err = c.lookupZone(ctx, parent) if err != nil { return "", err } @@ -45,9 +46,9 @@ func (c *Client) GetZoneID(domain string) (string, error) { } // lookupZone queries the CF API for a zone by name. -func (c *Client) lookupZone(name string) (string, error) { +func (c *Client) lookupZone(ctx context.Context, name string) (string, error) { path := "/zones?name=" + url.QueryEscape(name) + "&status=active" - resp, err := c.do("GET", path, nil) + resp, err := c.do(ctx, "GET", path, nil) if err != nil { return "", fmt.Errorf("lookup zone %q: %w", name, err) } diff --git a/controller/internal/crypto/crypto.go b/controller/internal/crypto/crypto.go index 22c9055..22376ac 100644 --- a/controller/internal/crypto/crypto.go +++ b/controller/internal/crypto/crypto.go @@ -7,6 +7,7 @@ import ( "crypto/rand" "encoding/base64" "fmt" + "log" "os" "strings" ) @@ -98,6 +99,7 @@ func IsEncrypted(value string) bool { } // DecryptMap decrypts all encrypted values in a map, returning a new map with plaintext values. +// Logs a warning for any value that fails to decrypt (key rotation, data corruption). func DecryptMap(key []byte, env map[string]string) map[string]string { if key == nil || env == nil { return env @@ -105,10 +107,14 @@ func DecryptMap(key []byte, env map[string]string) map[string]string { result := make(map[string]string, len(env)) for k, v := range env { if IsEncrypted(v) { - if dec, err := Decrypt(key, v); err == nil { - result[k] = dec + dec, err := Decrypt(key, v) + if err != nil { + log.Printf("[WARN] Failed to decrypt env var %q: %v — passing through encrypted value", k, err) + result[k] = v continue } + result[k] = dec + continue } result[k] = v } diff --git a/controller/internal/metrics/collector.go b/controller/internal/metrics/collector.go index 56d911f..45b9b6f 100644 --- a/controller/internal/metrics/collector.go +++ b/controller/internal/metrics/collector.go @@ -7,6 +7,7 @@ import ( "os/exec" "strconv" "strings" + "sync" "time" "gitea.dooplex.hu/admin/felhom-controller/internal/system" @@ -19,6 +20,7 @@ type MetricsCollector struct { hddPath string logger *log.Logger cancel context.CancelFunc + startOnce sync.Once } // NewMetricsCollector creates a new collector. @@ -32,9 +34,12 @@ func NewMetricsCollector(store *MetricsStore, cpuCollector *system.CPUCollector, } // Start begins the background collection loop (every 60 seconds). +// Safe to call multiple times — only the first call starts the loop. func (c *MetricsCollector) Start(ctx context.Context) { - ctx, c.cancel = context.WithCancel(ctx) - go c.loop(ctx) + c.startOnce.Do(func() { + ctx, c.cancel = context.WithCancel(ctx) + go c.loop(ctx) + }) } // Stop cancels the collection loop. diff --git a/controller/internal/monitor/watchdog.go b/controller/internal/monitor/watchdog.go index d7ba8e7..b1694f2 100644 --- a/controller/internal/monitor/watchdog.go +++ b/controller/internal/monitor/watchdog.go @@ -54,6 +54,7 @@ type WatchdogStackProvider interface { // pathProbeState tracks in-memory probe state for a single storage path. type pathProbeState struct { + mu sync.Mutex consecutiveFailures int lastStatus string // "connected", "disconnected" lastProbeTime time.Time @@ -141,10 +142,13 @@ func (w *StorageWatchdog) Check(ctx context.Context) error { state := w.getOrCreateState(sp.Path) // Rate-limit per-path probes + state.mu.Lock() if time.Since(state.lastProbeTime) < state.probeInterval { + state.mu.Unlock() continue } state.lastProbeTime = time.Now() + state.mu.Unlock() // Skip decommissioned drives entirely — no apps reference them if sp.Decommissioned { @@ -186,6 +190,9 @@ func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *p result := system.ProbeStoragePath(sp.Path) probeLatency := time.Since(probeStart) + state.mu.Lock() + defer state.mu.Unlock() + if w.isDebug() { state.probeCount++ state.totalLatency += probeLatency @@ -225,7 +232,9 @@ func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *p sp.Path, state.consecutiveFailures, probeThreshold, result.Err) if state.consecutiveFailures >= probeThreshold { + state.mu.Unlock() w.handleDisconnect(sp, state, result) + state.mu.Lock() // re-acquire for deferred Unlock } } @@ -251,9 +260,11 @@ func (w *StorageWatchdog) handleDisconnect(sp settings.StoragePath, state *pathP } // 4. Update in-memory state + state.mu.Lock() state.lastStatus = "disconnected" state.probeInterval = disconnectedProbeInterval state.consecutiveFailures = 0 + state.mu.Unlock() // 5. Trigger alert refresh if w.alertRefresh != nil { @@ -343,9 +354,11 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings. // Update in-memory state state := w.getOrCreateState(sp.Path) + state.mu.Lock() state.lastStatus = "connected" state.probeInterval = defaultProbeInterval state.consecutiveFailures = 0 + state.mu.Unlock() // Trigger alert refresh if w.alertRefresh != nil { @@ -551,9 +564,11 @@ func (w *StorageWatchdog) SafeDisconnect(ctx context.Context, path string) (stop // 5. Update in-memory state state := w.getOrCreateState(path) + state.mu.Lock() state.lastStatus = "disconnected" state.probeInterval = disconnectedProbeInterval state.consecutiveFailures = 0 + state.mu.Unlock() // 6. Trigger alert refresh if w.alertRefresh != nil { @@ -624,9 +639,11 @@ func (w *StorageWatchdog) Reconnect(ctx context.Context, path string) (stoppedSt // Update in-memory state state := w.getOrCreateState(path) + state.mu.Lock() state.lastStatus = "connected" state.probeInterval = defaultProbeInterval state.consecutiveFailures = 0 + state.mu.Unlock() // Trigger alert refresh if w.alertRefresh != nil { @@ -720,9 +737,11 @@ func (w *StorageWatchdog) SimulateDisconnect(ctx context.Context, path string) ( // Step 4: Update in-memory state state := w.getOrCreateState(path) + state.mu.Lock() state.lastStatus = "disconnected" state.probeInterval = disconnectedProbeInterval state.consecutiveFailures = 0 + state.mu.Unlock() // Step 5: Trigger alert refresh if w.alertRefresh != nil { @@ -782,9 +801,11 @@ func (w *StorageWatchdog) SimulateReconnect(ctx context.Context, path string) er // Update in-memory state state := w.getOrCreateState(path) + state.mu.Lock() state.lastStatus = "connected" state.probeInterval = defaultProbeInterval state.consecutiveFailures = 0 + state.mu.Unlock() // Trigger alert refresh if w.alertRefresh != nil { @@ -841,6 +862,7 @@ func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus { ds.Simulated = w.isSimulatedLocked(sp.Path) if state, ok := w.pathState[sp.Path]; ok { + state.mu.Lock() ds.DebounceCount = state.consecutiveFailures ds.LastProbe = state.lastProbeTime ds.ProbeOK = state.lastStatus == "connected" @@ -849,6 +871,7 @@ func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus { if state.probeCount > 0 { ds.AvgLatencyMs = float64(state.totalLatency.Milliseconds()) / float64(state.probeCount) } + state.mu.Unlock() } result = append(result, ds) } diff --git a/controller/internal/selftest/selftest.go b/controller/internal/selftest/selftest.go index fe2e33d..697ab2e 100644 --- a/controller/internal/selftest/selftest.go +++ b/controller/internal/selftest/selftest.go @@ -8,6 +8,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "time" "gitea.dooplex.hu/admin/felhom-controller/internal/backup" @@ -96,7 +97,7 @@ func checkDockerSocket() CheckResult { if err != nil { return CheckResult{Name: "Docker socket", Status: "fail", Message: fmt.Sprintf("docker info failed: %v", err)} } - return CheckResult{Name: "Docker socket", Status: "pass", Message: fmt.Sprintf("reachable (v%s)", string(out[:len(out)-1]))} + return CheckResult{Name: "Docker socket", Status: "pass", Message: fmt.Sprintf("reachable (v%s)", strings.TrimSpace(string(out)))} } func checkStacksDir(stacksDir string) CheckResult { diff --git a/controller/internal/selfupdate/updater.go b/controller/internal/selfupdate/updater.go index f4256a2..af2d67f 100644 --- a/controller/internal/selfupdate/updater.go +++ b/controller/internal/selfupdate/updater.go @@ -64,6 +64,8 @@ func NewUpdater(cfg *config.SelfUpdateConfig, gitCfg *config.GitConfig, currentV // SetBackupRunningCheck sets the callback to check if a backup is in progress. func (u *Updater) SetBackupRunningCheck(fn func() bool) { + u.mu.Lock() + defer u.mu.Unlock() u.backupRunning = fn } diff --git a/controller/internal/stacks/delete.go b/controller/internal/stacks/delete.go index 5b8b321..e3ea1bd 100644 --- a/controller/internal/stacks/delete.go +++ b/controller/internal/stacks/delete.go @@ -301,8 +301,14 @@ func (m *Manager) RemoveStack(name string, removeHDDData bool, backupPathsToRemo } // Step 5: Handle backup data cleanup + backupsBase := filepath.Join(hddPath, felhomDataDir, "backups") for _, bkPath := range backupPathsToRemove { cleanPath := filepath.Clean(bkPath) + // Validate path is under the expected backups directory + if hddPath == "" || !strings.HasPrefix(cleanPath, backupsBase+string(filepath.Separator)) { + m.logger.Printf("[WARN] Refusing to remove backup path outside expected directory: %s", cleanPath) + continue + } if _, err := os.Stat(cleanPath); os.IsNotExist(err) { continue } diff --git a/controller/internal/stacks/deploy.go b/controller/internal/stacks/deploy.go index d96510b..7c47085 100644 --- a/controller/internal/stacks/deploy.go +++ b/controller/internal/stacks/deploy.go @@ -298,7 +298,7 @@ func (m *Manager) runComposeDeploy(name, stackDir string, env map[string]string, if composeErr != nil { m.logger.Printf("[ERROR] Stack %s deploy failed after %.1fs: %v", name, time.Since(start).Seconds(), composeErr) - // Revert in-memory state + // Revert in-memory and disk state m.mu.Lock() if s, ok := m.stacks[name]; ok { s.Deployed = false @@ -306,10 +306,12 @@ func (m *Manager) runComposeDeploy(name, stackDir string, env map[string]string, s.DeployError = composeErr.Error() s.AppConfig = nil } - m.mu.Unlock() - // Revert disk state — keep app.yaml for debugging but mark as not deployed + // Also revert the shared appCfg under lock (C03 fix) appCfg.Deployed = false - _ = SaveAppConfig(stackDir, appCfg, nil, nil) + m.mu.Unlock() + // Save reverted state to disk with encryption (H05 fix) + meta := LoadMetadata(stackDir) + _ = SaveAppConfig(stackDir, appCfg, m.encKey, SensitiveEnvVars(&meta)) return } @@ -363,8 +365,10 @@ func (m *Manager) UpdateStackConfig(name string, values map[string]string) error return fmt.Errorf("saving updated config: %w", err) } - _, err := m.composeExecWithEnv(stackDir, appCfg.Env, "up", "-d") - if err != nil { + // Use stackEnv which loads decrypted values for docker compose (C01 fix). + // appCfg.Env may contain encrypted values from LoadAppConfig. + env := m.stackEnv(stackDir) + if _, err := m.composeExecCustomEnv(stackDir, env, "up", "-d"); err != nil { return fmt.Errorf("restarting with new config: %w", err) } @@ -552,8 +556,15 @@ func SaveAppConfig(stackDir string, cfg *AppConfig, encKey []byte, sensitiveVars path := filepath.Join(stackDir, "app.yaml") header := "# Auto-generated by felhom-controller — do not edit locked fields manually\n" content := header + string(data) - if err := os.WriteFile(path, []byte(content), 0600); err != nil { - return fmt.Errorf("writing %s: %w", path, err) + + // Atomic write: write to .tmp then rename (H04 fix) + tmpPath := path + ".tmp" + if err := os.WriteFile(tmpPath, []byte(content), 0600); err != nil { + return fmt.Errorf("writing %s: %w", tmpPath, err) + } + if err := os.Rename(tmpPath, path); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("renaming %s to %s: %w", tmpPath, path, err) } return nil } diff --git a/controller/internal/stacks/healthprobe.go b/controller/internal/stacks/healthprobe.go index 7d8b548..d6cf6dd 100644 --- a/controller/internal/stacks/healthprobe.go +++ b/controller/internal/stacks/healthprobe.go @@ -165,7 +165,7 @@ func (m *Manager) runSingleCheck(containerName string, check HealthCheckItem) He // probeTCP tests if a TCP port is reachable on the container. func (m *Manager) probeTCP(containerName string, port int, target string) HealthCheckDetail { start := time.Now() - addr := fmt.Sprintf("%s:%d", containerName, port) + addr := net.JoinHostPort(containerName, fmt.Sprintf("%d", port)) conn, err := net.DialTimeout("tcp", addr, 5*time.Second) latency := time.Since(start) diff --git a/controller/internal/stacks/manager.go b/controller/internal/stacks/manager.go index c82e156..e1d0250 100644 --- a/controller/internal/stacks/manager.go +++ b/controller/internal/stacks/manager.go @@ -112,6 +112,8 @@ func NewManager(cfg *config.Config, logger *log.Logger) (*Manager, error) { // SetEncryptionKey sets the AES-256 key used to encrypt/decrypt sensitive values in app.yaml. func (m *Manager) SetEncryptionKey(key []byte) { + m.mu.Lock() + defer m.mu.Unlock() m.encKey = key } @@ -121,8 +123,8 @@ func (m *Manager) MigrateEncryption() { if m.encKey == nil { return } - m.mu.RLock() - defer m.mu.RUnlock() + m.mu.Lock() + defer m.mu.Unlock() migrated := 0 for _, s := range m.stacks { @@ -446,7 +448,7 @@ func (m *Manager) GetStacks() []Stack { result := make([]Stack, 0, len(m.stacks)) for _, s := range m.stacks { - result = append(result, *s) + result = append(result, deepCopyStack(s)) } // Sort alphabetically by display name for consistent UI ordering @@ -465,8 +467,53 @@ func (m *Manager) GetStack(name string) (*Stack, bool) { if !ok { return nil, false } - copy := *s - return ©, true + cp := deepCopyStack(s) + return &cp, true +} + +// deepCopyStack creates a deep copy of a Stack, including pointer fields. +func deepCopyStack(s *Stack) Stack { + cp := *s + + // Deep-copy Containers slice + if s.Containers != nil { + cp.Containers = make([]ContainerInfo, len(s.Containers)) + copy(cp.Containers, s.Containers) + } + + // Deep-copy AppConfig pointer + if s.AppConfig != nil { + acCopy := *s.AppConfig + if s.AppConfig.Env != nil { + acCopy.Env = make(map[string]string, len(s.AppConfig.Env)) + for k, v := range s.AppConfig.Env { + acCopy.Env[k] = v + } + } + if s.AppConfig.LockedFields != nil { + acCopy.LockedFields = make([]string, len(s.AppConfig.LockedFields)) + copy(acCopy.LockedFields, s.AppConfig.LockedFields) + } + cp.AppConfig = &acCopy + } + + // Deep-copy HealthProbe pointer + if s.HealthProbe != nil { + hpCopy := *s.HealthProbe + if s.HealthProbe.Details != nil { + hpCopy.Details = make([]HealthCheckDetail, len(s.HealthProbe.Details)) + copy(hpCopy.Details, s.HealthProbe.Details) + } + cp.HealthProbe = &hpCopy + } + + // Deep-copy Meta.DeployFields slice + if s.Meta.DeployFields != nil { + cp.Meta.DeployFields = make([]DeployField, len(s.Meta.DeployFields)) + copy(cp.Meta.DeployFields, s.Meta.DeployFields) + } + + return cp } // --- Stack operations --- diff --git a/controller/internal/storage/attach_linux.go b/controller/internal/storage/attach_linux.go index 601b782..c5a1558 100644 --- a/controller/internal/storage/attach_linux.go +++ b/controller/internal/storage/attach_linux.go @@ -55,7 +55,11 @@ func MountRaw(devicePath string) (string, error) { // Choose a directory name: prefer label, fall back to UUID prefix dirName := label if dirName == "" && uuid != "" { - dirName = uuid[:8] // use first 8 chars of UUID + if len(uuid) > 8 { + dirName = uuid[:8] + } else { + dirName = uuid + } } if dirName == "" { dirName = filepath.Base(devicePath) // "sdb1" @@ -441,12 +445,12 @@ func removeBindFstabEntry(fstabPath, targetMountPath string) error { // Remove both the comment line and the bind mount line if strings.Contains(line, "Bind mount (auto-generated by felhom-controller)") { // Check if the next line is the actual bind entry for this target - if i+1 < len(lines) && strings.Contains(lines[i+1], targetMountPath) { + if i+1 < len(lines) && fstabMatchesTarget(lines[i+1], targetMountPath) { i++ // skip the bind line too continue } } - if strings.Contains(line, targetMountPath) && strings.Contains(line, "bind") { + if fstabMatchesTarget(line, targetMountPath) && strings.Contains(line, "bind") { continue } kept = append(kept, line) @@ -454,3 +458,16 @@ func removeBindFstabEntry(fstabPath, targetMountPath string) error { return safeWriteFile(fstabPath, []byte(strings.Join(kept, "\n")), 0644) } + +// fstabMatchesTarget parses an fstab line and checks if the mount target (field 2) matches exactly. +func fstabMatchesTarget(line, target string) bool { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + return false + } + fields := strings.Fields(line) + if len(fields) < 2 { + return false + } + return fields[1] == target +} diff --git a/controller/internal/storage/migrate_drive.go b/controller/internal/storage/migrate_drive.go index ce0e93b..1bfad44 100644 --- a/controller/internal/storage/migrate_drive.go +++ b/controller/internal/storage/migrate_drive.go @@ -312,7 +312,10 @@ func (dm *DriveMigrator) MigrateDrive(ctx context.Context, req DriveMigrateReque }() var stderrBuf strings.Builder + var stderrWg sync.WaitGroup + stderrWg.Add(1) go func() { + defer stderrWg.Done() buf := make([]byte, 4096) for { n, err := stderr.Read(buf) @@ -326,10 +329,12 @@ func (dm *DriveMigrator) MigrateDrive(ctx context.Context, req DriveMigrateReque }() if err := rsyncCmd.Wait(); err != nil { + stderrWg.Wait() send("rolling_back", "rsync sikertelen, visszagörgetés...", 0) tx.rollback() return fail("Adatmásolás sikertelen", fmt.Errorf("rsync failed: %w — %s", err, stderrBuf.String())) } + stderrWg.Wait() // --- Step 3: Verify copy --- send("verifying", "Másolat ellenőrzése...", 62) diff --git a/controller/internal/sync/sync.go b/controller/internal/sync/sync.go index 848fce6..4c95d4b 100644 --- a/controller/internal/sync/sync.go +++ b/controller/internal/sync/sync.go @@ -30,6 +30,7 @@ type Syncer struct { lastErr error syncing bool stopCh chan struct{} + stopOnce sync.Once } // SyncStatus holds information about the last sync operation. @@ -110,9 +111,11 @@ func (s *Syncer) Start() { }() } -// Stop terminates the periodic sync loop. +// Stop terminates the periodic sync loop. Safe to call multiple times. func (s *Syncer) Stop() { - close(s.stopCh) + s.stopOnce.Do(func() { + close(s.stopCh) + }) } // TriggerSync performs an immediate sync. Returns the result. @@ -131,6 +134,7 @@ func (s *Syncer) TriggerSync() SyncResult { s.mu.Unlock() return SyncResult{OK: false, Message: "Túl gyakori szinkronizálás — várj 30 másodpercet"} } + s.syncing = true s.mu.Unlock() return s.doSync() diff --git a/controller/internal/web/csrf.go b/controller/internal/web/csrf.go index 1d87077..78b2c17 100644 --- a/controller/internal/web/csrf.go +++ b/controller/internal/web/csrf.go @@ -33,11 +33,15 @@ func (s *Server) CsrfProtect(next http.Handler) http.Handler { } // Skip CSRF for Bearer-token authenticated requests. - // These endpoints also accept session auth, but when a Bearer token - // is present, the request is from a script/hub, not a browser. + // Validate the token against the configured API key before skipping. if auth := r.Header.Get("Authorization"); strings.HasPrefix(auth, "Bearer ") { - next.ServeHTTP(w, r) - return + token := strings.TrimPrefix(auth, "Bearer ") + apiKey := s.cfg.Hub.APIKey + if apiKey != "" && subtle.ConstantTimeCompare([]byte(token), []byte(apiKey)) == 1 { + next.ServeHTTP(w, r) + return + } + // Invalid Bearer token — fall through to CSRF validation } // Get the session's CSRF token diff --git a/controller/internal/web/handlers.go b/controller/internal/web/handlers.go index 25b21d0..4dbd881 100644 --- a/controller/internal/web/handlers.go +++ b/controller/internal/web/handlers.go @@ -915,7 +915,7 @@ func (s *Server) buildAppBackupRows( } // Destination health check — can downgrade green to yellow/red - if cfg.DestinationPath != "" { + if cfg.DestinationPath != "" && s.crossDriveRunner != nil { if err := s.crossDriveRunner.ValidateDestination(cfg.DestinationPath); err != nil { if strings.Contains(err.Error(), "does not exist") || strings.Contains(err.Error(), "not writable") { row.Status = "red" diff --git a/controller/internal/web/server.go b/controller/internal/web/server.go index 1f4f241..bebbd85 100644 --- a/controller/internal/web/server.go +++ b/controller/internal/web/server.go @@ -1,6 +1,7 @@ package web import ( + "bytes" "fmt" "html/template" "log" @@ -391,11 +392,14 @@ func (s *Server) primaryHDDPath() string { } func (s *Server) render(w http.ResponseWriter, name string, data interface{}) { - w.Header().Set("Content-Type", "text/html; charset=utf-8") - if err := s.tmpl.ExecuteTemplate(w, name, data); err != nil { + var buf bytes.Buffer + if err := s.tmpl.ExecuteTemplate(&buf, name, data); err != nil { s.logger.Printf("[ERROR] Template error (%s): %v", name, err) http.Error(w, "Internal error", http.StatusInternalServerError) + return } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + buf.WriteTo(w) } // executeTemplate renders a template with CSRF data auto-injected into the data map. @@ -406,11 +410,14 @@ func (s *Server) executeTemplate(w http.ResponseWriter, r *http.Request, name st } data["CSRFField"] = s.csrfField(r) data["CSRFToken"] = s.csrfToken(r) - w.Header().Set("Content-Type", "text/html; charset=utf-8") - if err := s.tmpl.ExecuteTemplate(w, name, data); err != nil { + var buf bytes.Buffer + if err := s.tmpl.ExecuteTemplate(&buf, name, data); err != nil { s.logger.Printf("[ERROR] Template error (%s): %v", name, err) http.Error(w, "Internal error", http.StatusInternalServerError) + return } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + buf.WriteTo(w) } // --- Static file / asset serving --- diff --git a/controller/internal/web/storage_handlers.go b/controller/internal/web/storage_handlers.go index bb9ab49..322b3c7 100644 --- a/controller/internal/web/storage_handlers.go +++ b/controller/internal/web/storage_handlers.go @@ -915,22 +915,21 @@ func (s *Server) storageAttachMountRawHandler(w http.ResponseWriter, r *http.Req return } - // Clean up any previous raw mount first + // Hold lock across entire cleanup+mount+set to prevent races s.diskJobMu.Lock() if s.activeRawMount != "" { _ = storage.CleanupRawMount(s.activeRawMount) s.activeRawMount = "" } - s.diskJobMu.Unlock() rawPath, err := storage.MountRaw(req.DevicePath) if err != nil { + s.diskJobMu.Unlock() s.logger.Printf("[ERROR] storageAttachMountRaw: %v", err) jsonError(w, err.Error(), http.StatusInternalServerError) return } - s.diskJobMu.Lock() s.activeRawMount = rawPath s.diskJobMu.Unlock()