Fix bugs from BUGHUNT.md: restore race conditions, infra backup, DR wiring, docker-setup.sh, restore.html

This commit is contained in:
2026-02-19 14:06:42 +01:00
parent cdaa137118
commit 75ea9d73f0
7 changed files with 1058 additions and 68 deletions
+20 -12
View File
@@ -76,10 +76,7 @@ func main() {
logger.Printf("[INFO] Found infra backup on Hub: %s (%s), %d stacks, synced %s",
ib.Domain, ib.CustomerID, len(ib.DeployedStacks), ib.Timestamp)
// Restore restic passwords
restorePasswordsFromHub(ib, cfg, sett, logger)
// Restore settings.json from Hub backup
// Restore settings.json from Hub backup first
restoreSettingsFromHub(ib, cfg, logger)
// Re-load settings (now from restored file)
@@ -88,6 +85,9 @@ func main() {
logger.Println("[INFO] Settings reloaded after Hub restore")
}
// Restore restic passwords AFTER settings reload so cross-drive password persists
restorePasswordsFromHub(ib, cfg, sett, logger)
// Mount drives using stored disk layout
mountCtx, mountCancel := context.WithTimeout(context.Background(), 2*time.Minute)
mountedPaths, mountErr := backup.MountDrivesFromLayout(mountCtx, ib.DiskLayout, logger)
@@ -122,11 +122,14 @@ func main() {
}
restorePlan = backup.ScanDrivesForBackups(drivePaths, infraStacks, logger)
restorePlan.CustomerID = ib.CustomerID
restorePlan.Domain = ib.Domain
restorePlan.Timestamp = ib.Timestamp
logger.Printf("[INFO] DR restore plan ready: %d apps to restore", len(restorePlan.Apps))
if restorePlan != nil {
restorePlan.CustomerID = ib.CustomerID
restorePlan.Domain = ib.Domain
restorePlan.Timestamp = ib.Timestamp
logger.Printf("[INFO] DR restore plan ready: %d apps to restore", len(restorePlan.Apps))
} else {
logger.Println("[WARN] ScanDrivesForBackups returned nil — no restore plan created")
}
}
} else {
logger.Println("[INFO] No infra backup found on Hub for this customer")
@@ -589,7 +592,7 @@ func pushInfraBackup(cfg *config.Config, sett *settings.Settings,
filepath.Join(cfg.Paths.DataDir, "settings.json"),
cfg.Backup.ResticPasswordFile,
cfg.Paths.SystemDataPath,
sett, stackProv,
sett, stackProv, logger,
)
if err != nil {
logger.Printf("[WARN] Failed to build infra backup: %v", err)
@@ -621,8 +624,9 @@ func restorePasswordsFromHub(ib *report.InfraBackup, cfg *config.Config,
decoded, err := base64.StdEncoding.DecodeString(ib.ResticPassword)
if err == nil && len(decoded) > 0 {
dir := filepath.Dir(cfg.Backup.ResticPasswordFile)
os.MkdirAll(dir, 0700)
if err := os.WriteFile(cfg.Backup.ResticPasswordFile, decoded, 0600); err == nil {
if err := os.MkdirAll(dir, 0700); err != nil {
logger.Printf("[WARN] Failed to create restic password directory %s: %v", dir, err)
} else if err := os.WriteFile(cfg.Backup.ResticPasswordFile, decoded, 0600); err == nil {
logger.Println("[INFO] Primary restic password restored from Hub")
} else {
logger.Printf("[WARN] Failed to write restic password file: %v", err)
@@ -649,6 +653,10 @@ func restoreSettingsFromHub(ib *report.InfraBackup, cfg *config.Config, logger *
logger.Printf("[WARN] Failed to decode settings from Hub: %v", err)
return
}
if err := os.MkdirAll(cfg.Paths.DataDir, 0755); err != nil {
logger.Printf("[WARN] Failed to create data directory for settings restore: %v", err)
return
}
settingsPath := filepath.Join(cfg.Paths.DataDir, "settings.json")
if err := os.WriteFile(settingsPath, decoded, 0600); err != nil {
logger.Printf("[WARN] Failed to write restored settings.json: %v", err)
+39 -3
View File
@@ -71,14 +71,46 @@ func (rp *RestorePlan) GetApps() []RestorableApp {
// Snapshot returns a JSON-serializable view of the plan: an "ok" flag, the
// overall status, and the app and drive lists.
//
// The app and drive slices are copied (shallowly) while the read lock is
// held, so the returned map does not alias the plan's own slices and can be
// encoded after the lock has been released.
func (rp *RestorePlan) Snapshot() map[string]interface{} {
	rp.mu.RLock()
	defer rp.mu.RUnlock()

	apps := make([]RestorableApp, len(rp.Apps))
	copy(apps, rp.Apps)
	drives := make([]DriveInfo, len(rp.Drives))
	copy(drives, rp.Drives)

	// Each key appears exactly once; only the copies are exposed, never
	// rp.Apps / rp.Drives themselves.
	return map[string]interface{}{
		"ok":     true,
		"status": rp.Status,
		"apps":   apps,
		"drives": drives,
	}
}
// TryStartRestore claims the plan for a restore run. With the plan's lock
// held it checks whether Status is already "restoring": if so it reports
// false and changes nothing, otherwise it sets Status to "restoring" and
// reports true. Because the check and the write happen under one lock, two
// concurrent callers cannot both receive true.
func (rp *RestorePlan) TryStartRestore() bool {
	rp.mu.Lock()
	defer rp.mu.Unlock()

	claimed := rp.Status != "restoring"
	if claimed {
		rp.Status = "restoring"
	}
	return claimed
}
// SetStatus replaces the plan's overall Status with the given value while
// holding the plan's lock.
func (rp *RestorePlan) SetStatus(status string) {
	rp.mu.Lock()
	rp.Status = status
	rp.mu.Unlock()
}
// GetStatus reads the plan's overall Status while holding the plan's read
// lock and returns the value that was read.
func (rp *RestorePlan) GetStatus() string {
	rp.mu.RLock()
	current := rp.Status
	rp.mu.RUnlock()
	return current
}
// UpdateApp updates a single app's status in the plan.
func (rp *RestorePlan) UpdateApp(name, status, errMsg string) {
rp.mu.Lock()
@@ -231,9 +263,13 @@ func dirExists(path string) bool {
}
// dirIsEmpty returns true if a directory has no entries.
// Returns false on read errors (assume non-empty — safer for backup detection):
// a path that is missing or unreadable must not be reported as empty.
func dirIsEmpty(path string) bool {
	entries, err := os.ReadDir(path)
	if err != nil {
		// The directory could not be listed, so emptiness cannot be proven.
		return false
	}
	return len(entries) == 0
}
// hasUserData checks if the rsync backup dir has user data (not just _config/_db).
+16 -6
View File
@@ -2,6 +2,8 @@ package report
import (
"encoding/base64"
"fmt"
"log"
"os"
"time"
@@ -43,6 +45,7 @@ func BuildInfraBackup(
systemDataPath string,
sett *settings.Settings,
stackProvider backup.StackDataProvider,
logger *log.Logger,
) (*InfraBackup, error) {
ib := &InfraBackup{
CustomerID: customerID,
@@ -51,22 +54,29 @@ func BuildInfraBackup(
Timestamp: time.Now().UTC().Format(time.RFC3339),
}
// Read and encode controller.yaml
if data, err := os.ReadFile(controllerYAMLPath); err == nil {
ib.ControllerConfigB64 = base64.StdEncoding.EncodeToString(data)
// Read and encode controller.yaml (critical — fail if unreadable)
data, err := os.ReadFile(controllerYAMLPath)
if err != nil {
return nil, fmt.Errorf("reading controller config %s: %w", controllerYAMLPath, err)
}
ib.ControllerConfigB64 = base64.StdEncoding.EncodeToString(data)
// Read and encode settings.json
// Read and encode settings.json (important but non-fatal)
if data, err := os.ReadFile(settingsPath); err == nil {
ib.SettingsJSONB64 = base64.StdEncoding.EncodeToString(data)
} else if !os.IsNotExist(err) {
logger.Printf("[WARN] Infra backup: could not read settings.json: %v", err)
}
// Read primary restic password
// Read primary restic password (important but non-fatal)
if data, err := os.ReadFile(resticPasswordFile); err == nil {
ib.ResticPassword = base64.StdEncoding.EncodeToString(data)
} else if !os.IsNotExist(err) {
logger.Printf("[WARN] Infra backup: could not read restic password file: %v", err)
}
// Read cross-drive restic password
// Cross-drive password is stored as plain text (not base64) because it's
// already a string in settings, unlike ResticPassword which comes from a file.
if pw := sett.GetCrossDriveResticPassword(); pw != "" {
ib.CrossDrivePassword = pw
}
+47 -28
View File
@@ -11,21 +11,32 @@ import (
// restorePageHandler renders the full-page DR restore UI.
func (s *Server) restorePageHandler(w http.ResponseWriter, r *http.Request) {
if s.restorePlan == nil {
s.restoreMu.RLock()
plan := s.restorePlan
if plan == nil {
s.restoreMu.RUnlock()
http.Redirect(w, r, "/", http.StatusFound)
return
}
// Snapshot all needed fields under lock before rendering
customerID := plan.CustomerID
timestamp := plan.Timestamp
apps := plan.GetApps()
drives := make([]backup.DriveInfo, len(plan.Drives))
copy(drives, plan.Drives)
status := plan.GetStatus()
s.restoreMu.RUnlock()
data := map[string]interface{}{
"Title": "Katasztrófa utáni visszaállítás",
"CustomerName": s.cfg.Customer.Name,
"Domain": s.cfg.Customer.Domain,
"Version": s.version,
"CustomerID": s.restorePlan.CustomerID,
"Timestamp": s.restorePlan.Timestamp,
"Apps": s.restorePlan.GetApps(),
"Drives": s.restorePlan.Drives,
"PlanStatus": s.restorePlan.Status,
"CustomerID": customerID,
"Timestamp": timestamp,
"Apps": apps,
"Drives": drives,
"PlanStatus": status,
}
s.render(w, "restore", data)
@@ -33,27 +44,33 @@ func (s *Server) restorePageHandler(w http.ResponseWriter, r *http.Request) {
// apiRestoreStatus returns the current restore plan status as JSON.
func (s *Server) apiRestoreStatus(w http.ResponseWriter, r *http.Request) {
if s.restorePlan == nil {
s.restoreMu.RLock()
plan := s.restorePlan
if plan == nil {
s.restoreMu.RUnlock()
jsonError(w, "not in restore mode", http.StatusBadRequest)
return
}
snapshot := plan.Snapshot()
s.restoreMu.RUnlock()
w.Header().Set("Content-Type", "application/json; charset=utf-8")
json.NewEncoder(w).Encode(s.restorePlan.Snapshot())
json.NewEncoder(w).Encode(snapshot)
}
// apiRestoreAll starts restoring all pending apps sequentially.
func (s *Server) apiRestoreAll(w http.ResponseWriter, r *http.Request) {
if s.restorePlan == nil {
s.restoreMu.RLock()
plan := s.restorePlan
s.restoreMu.RUnlock()
if plan == nil {
jsonError(w, "not in restore mode", http.StatusBadRequest)
return
}
if s.restorePlan.Status == "restoring" {
if !plan.TryStartRestore() {
jsonError(w, "restore already in progress", http.StatusConflict)
return
}
s.restorePlan.Status = "restoring"
go s.executeAllRestores()
jsonResponse(w, map[string]interface{}{
@@ -64,7 +81,10 @@ func (s *Server) apiRestoreAll(w http.ResponseWriter, r *http.Request) {
// apiRestoreSkip exits restore mode without restoring.
func (s *Server) apiRestoreSkip(w http.ResponseWriter, r *http.Request) {
if s.restorePlan == nil {
s.restoreMu.RLock()
plan := s.restorePlan
s.restoreMu.RUnlock()
if plan == nil {
jsonError(w, "not in restore mode", http.StatusBadRequest)
return
}
@@ -82,13 +102,21 @@ func (s *Server) apiRestoreSkip(w http.ResponseWriter, r *http.Request) {
func (s *Server) executeAllRestores() {
s.logger.Println("[INFO] Starting DR restore for all apps")
for i := range s.restorePlan.Apps {
app := &s.restorePlan.Apps[i]
s.restoreMu.RLock()
plan := s.restorePlan
s.restoreMu.RUnlock()
if plan == nil {
s.logger.Println("[WARN] Restore plan cleared before execution could start")
return
}
for i := range plan.Apps {
app := &plan.Apps[i]
if app.Status != "pending" {
continue
}
s.restorePlan.UpdateApp(app.Name, "restoring", "")
plan.UpdateApp(app.Name, "restoring", "")
s.logger.Printf("[INFO] Restoring app %s (%s)", app.Name, app.DisplayName)
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
@@ -96,15 +124,15 @@ func (s *Server) executeAllRestores() {
cancel()
if err != nil {
s.restorePlan.UpdateApp(app.Name, "failed", err.Error())
plan.UpdateApp(app.Name, "failed", err.Error())
s.logger.Printf("[ERROR] Restore failed for %s: %v", app.Name, err)
} else {
s.restorePlan.UpdateApp(app.Name, "done", "")
plan.UpdateApp(app.Name, "done", "")
s.logger.Printf("[INFO] Restore completed for %s", app.Name)
}
}
s.restorePlan.Status = "done"
plan.SetStatus("done")
s.logger.Println("[INFO] All app restores completed")
// Re-scan stacks so dashboard picks up restored apps
@@ -113,15 +141,6 @@ func (s *Server) executeAllRestores() {
s.logger.Printf("[WARN] Post-restore stack scan failed: %v", err)
}
}
// Auto-clear restore mode after a brief delay so user can see final status
go func() {
time.Sleep(5 * time.Second)
// Only auto-clear if user hasn't already navigated away
if s.restorePlan != nil && s.restorePlan.AllDone() {
// Keep plan visible — user clicks "continue to dashboard" to clear
}
}()
}
// clearRestoreMode exits restore mode and returns to normal operation.
+31 -8
View File
@@ -255,10 +255,15 @@
pollStatus();
}
var pollErrors = 0;
function pollStatus() {
fetch('/api/restore/status')
.then(function(resp) { return resp.json(); })
.then(function(resp) {
if (!resp.ok) throw new Error('HTTP ' + resp.status);
return resp.json();
})
.then(function(data) {
pollErrors = 0;
if (!data.ok) return;
updateTable(data.apps || []);
updateProgress(data.apps || []);
@@ -270,23 +275,41 @@
updateActions();
}
})
.catch(function() {});
.catch(function(err) {
pollErrors++;
console.error('Poll error:', err);
if (pollErrors >= 10) {
clearInterval(polling);
polling = null;
var actions = document.getElementById('dr-actions');
if (actions) {
actions.innerHTML = '<p style="color:var(--danger)">Kapcsolat megszakadt. <a href="/restore">Oldal frissítése</a></p>';
}
}
});
}
// updateTable refreshes the status cell(s) of each app from the polled data.
//
// Server-provided values (status text, error message) are inserted as text
// nodes / textContent only, so they are never parsed as HTML. Cells are
// matched by comparing their data-app attribute to the app name instead of
// interpolating the name into a CSS selector, so a name containing quotes or
// brackets cannot break the lookup.
function updateTable(apps) {
    var cells = document.querySelectorAll('.app-status[data-app]');
    apps.forEach(function(app) {
        cells.forEach(function(cell) {
            if (cell.getAttribute('data-app') !== String(app.name)) {
                return;
            }

            var span = document.createElement('span');
            span.className = 'status-' + app.status;

            if (app.status === 'restoring') {
                var spinner = document.createElement('span');
                spinner.className = 'spinner';
                span.appendChild(spinner);
                span.appendChild(document.createTextNode(' '));
            }

            span.appendChild(document.createTextNode(statusText(app.status)));

            if (app.error) {
                // Show at most the first 60 characters of the error message.
                var errSpan = document.createElement('span');
                errSpan.style.cssText = 'font-size:.8rem;color:var(--danger)';
                errSpan.textContent = ' (' + app.error.substring(0, 60) + ')';
                span.appendChild(errSpan);
            }

            // Drop the previous content, then attach the freshly built node.
            cell.textContent = '';
            cell.appendChild(span);
        });
    });
}