diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fb57fe..1fb8f0b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ ## Changelog +### v0.29.3 — Controller-side Health Probes (2026-02-25) + +#### Added +- **HTTP/TCP health probes** (`internal/stacks/healthprobe.go`) — The controller now probes deployed apps directly over the Docker network to verify services are actually responding, not just that containers are running. Runs every minute, configurable per-app interval (default 5 min). +- **Three probe types**: `http` (any response = alive), `api` (validates status code and response body), `tcp` (port reachability). Multiple checks per app supported. +- **`.felhom.yml` healthcheck config** (`internal/stacks/metadata.go`) — New `healthcheck:` section with `interval`, `checks[]` (type, port, path, method, expect). Parsed from app catalog metadata. +- **State override** (`internal/stacks/manager.go`) — If a running container's health probe fails, the stack state is overridden to "unhealthy". Clears automatically when probe passes again. + +#### Fixed +- **Vikunja healthcheck** — Removed Docker-level healthcheck (distroless image has no wget/curl). Controller-side API probe to `:3456/api/v1/info` replaces it. + ### v0.29.2 — Dynamic Logo & Favicon (2026-02-25) #### Changed diff --git a/controller/README.md b/controller/README.md index 0536d90..4e79e13 100644 --- a/controller/README.md +++ b/controller/README.md @@ -212,11 +212,22 @@ When app templates are updated (e.g., a new `APP_KEY` secret is added to `.felho | Running + healthy | Green | "Fut" | All containers running and healthy | | Running + starting | Orange | "Indulas..." | Healthcheck not yet passed | | Deploying | Orange | "Telepítés..." | Compose up in progress (image pull, container creation) | -| Running + unhealthy | Yellow | "Nem egeszseges" | Healthcheck failing | +| Running + unhealthy | Yellow | "Nem egeszseges" | Docker or controller-side healthcheck failing | | Stopped/exited | Red | "Leallitva" | All containers stopped | | Restarting | Yellow | "Ujrainditas..." | Restart loop | | Not deployed | Gray | "Nincs telepitve" | Compose file exists, not deployed | +#### Controller-side Health Probes (`internal/stacks/healthprobe.go`) + +For apps that declare a `healthcheck:` section in `.felhom.yml`, the controller probes the container directly over the Docker network (both are on `traefik-public`). This complements Docker-level healthchecks and is the **only** health mechanism for distroless/scratch images that lack shell utilities. + +Three probe types are supported: +- **`http`** — Any HTTP response (even 4xx/5xx) = service is alive. Only connection refused/timeout = unhealthy. +- **`api`** — HTTP request with response validation (expected status code, body content). Fails if expectations aren't met. +- **`tcp`** — Simple port reachability check via `net.Dial`. + +Multiple checks per app are supported (all must pass). The probe scheduler runs every minute; per-app intervals default to 5 minutes and are configurable via `healthcheck.interval` in `.felhom.yml`. Probe results are stored in `Stack.HealthProbe` and exposed via the API. Failed probes override the stack state to `StateUnhealthy`; the override clears automatically when the next probe passes. + --- ### 2. Backup System diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go index 6aa0f88..10f684b 100644 --- a/controller/cmd/controller/main.go +++ b/controller/cmd/controller/main.go @@ -220,6 +220,9 @@ func main() { sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error { return stackMgr.ScanStacks() }) + sched.Every("health-probes", 1*time.Minute, func(ctx context.Context) error { + return stackMgr.RunHealthProbes() + }) // Heartbeat — lightweight "I'm alive" signal sched.Every("heartbeat", 5*time.Minute, func(ctx context.Context) error { diff --git a/controller/internal/stacks/healthprobe.go b/controller/internal/stacks/healthprobe.go new file mode 100644 index 0000000..7d8b548 --- /dev/null +++ b/controller/internal/stacks/healthprobe.go @@ -0,0 +1,323 @@ +package stacks + +import ( + "fmt" + "io" + "net" + "net/http" + "strings" + "sync" + "time" +) + +// probeTarget holds the info needed to probe a single stack. +type probeTarget struct { + stackName string + containerName string + checks []HealthCheckItem +} + +// RunHealthProbes runs controller-side health probes for all running stacks +// that have healthcheck configuration and whose interval has elapsed. +// Called by the scheduler every minute. +func (m *Manager) RunHealthProbes() error { + // Phase 1: collect targets (under lock) + m.mu.RLock() + var targets []probeTarget + for name, stack := range m.stacks { + if stack.State != StateRunning && stack.State != StateUnhealthy { + continue + } + hc := stack.Meta.HealthCheck + if hc == nil || len(hc.Checks) == 0 { + continue + } + + // Check if interval has elapsed since last probe + interval := parseInterval(hc.Interval) + if stack.HealthProbe != nil && time.Since(stack.HealthProbe.LastCheck) < interval { + continue + } + + // Find the main container to probe (matching stack name) + containerName := findProbeContainer(name, stack.Containers) + if containerName == "" { + continue + } + + targets = append(targets, probeTarget{ + stackName: name, + containerName: containerName, + checks: hc.Checks, + }) + } + m.mu.RUnlock() + + if len(targets) == 0 { + return nil + } + + // Phase 2: run all probes concurrently (no lock held) + type probeResult struct { + stackName string + result *HealthProbeResult + } + + results := make([]probeResult, len(targets)) + var wg sync.WaitGroup + + for i, t := range targets { + wg.Add(1) + go func(idx int, t probeTarget) { + defer wg.Done() + result := m.runChecks(t) + results[idx] = probeResult{stackName: t.stackName, result: result} + }(i, t) + } + wg.Wait() + + // Phase 3: apply results and log (under lock) + m.mu.Lock() + okCount, failCount := 0, 0 + for _, pr := range results { + stack, ok := m.stacks[pr.stackName] + if !ok { + continue + } + stack.HealthProbe = pr.result + + if pr.result.Healthy { + okCount++ + // If Docker says running and probe is healthy, ensure state is running + // (clears a previous unhealthy override) + if stack.State == StateUnhealthy { + stack.State = StateRunning + } + } else { + failCount++ + if stack.State == StateRunning { + stack.State = StateUnhealthy + } + } + } + m.mu.Unlock() + + // Summary log + if failCount > 0 { + m.logger.Printf("[INFO] Health probes: %d ok, %d unhealthy (of %d probed)", okCount, failCount, len(targets)) + } else if m.isDebug() { + m.logger.Printf("[DEBUG] Health probes: %d ok (of %d probed)", okCount, len(targets)) + } + + return nil +} + +// runChecks executes all health check items for a single stack target. +func (m *Manager) runChecks(t probeTarget) *HealthProbeResult { + result := &HealthProbeResult{ + LastCheck: time.Now(), + Healthy: true, + } + + for _, check := range t.checks { + detail := m.runSingleCheck(t.containerName, check) + result.Details = append(result.Details, detail) + + if detail.Healthy { + if m.isDebug() { + if detail.Status > 0 { + m.logger.Printf("[DEBUG] Health probe %s: %s %s :%d%s → %d (%s)", + t.stackName, strings.ToUpper(check.Type), methodOrEmpty(check), check.Port, check.Path, detail.Status, detail.Latency) + } else { + m.logger.Printf("[DEBUG] Health probe %s: TCP :%d → ok (%s)", + t.stackName, check.Port, detail.Latency) + } + } + } else { + result.Healthy = false + m.logger.Printf("[WARN] Health probe %s: %s %s :%d%s → %s", + t.stackName, strings.ToUpper(check.Type), methodOrEmpty(check), check.Port, check.Path, detail.Error) + } + } + + return result +} + +// runSingleCheck executes one health check item and returns the result. +func (m *Manager) runSingleCheck(containerName string, check HealthCheckItem) HealthCheckDetail { + target := fmt.Sprintf(":%d%s", check.Port, check.Path) + + switch check.Type { + case "tcp": + return m.probeTCP(containerName, check.Port, target) + case "http", "api": + return m.probeHTTP(containerName, check, target) + default: + return HealthCheckDetail{ + Type: check.Type, + Target: target, + Healthy: false, + Error: fmt.Sprintf("unknown check type: %s", check.Type), + } + } +} + +// probeTCP tests if a TCP port is reachable on the container. +func (m *Manager) probeTCP(containerName string, port int, target string) HealthCheckDetail { + start := time.Now() + addr := fmt.Sprintf("%s:%d", containerName, port) + conn, err := net.DialTimeout("tcp", addr, 5*time.Second) + latency := time.Since(start) + + detail := HealthCheckDetail{ + Type: "tcp", + Target: target, + Latency: formatLatency(latency), + } + + if err != nil { + detail.Healthy = false + detail.Error = err.Error() + } else { + conn.Close() + detail.Healthy = true + } + return detail +} + +// probeHTTP makes an HTTP request to the container and evaluates the result. +// For "http" type: any response = healthy. For "api" type: validates expect rules. +func (m *Manager) probeHTTP(containerName string, check HealthCheckItem, target string) HealthCheckDetail { + url := fmt.Sprintf("http://%s:%d%s", containerName, check.Port, check.Path) + method := check.Method + if method == "" { + method = "GET" + } + + start := time.Now() + + client := &http.Client{ + Timeout: 5 * time.Second, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + return http.ErrUseLastResponse + }, + } + + req, err := http.NewRequest(method, url, nil) + if err != nil { + return HealthCheckDetail{ + Type: check.Type, + Target: target, + Healthy: false, + Error: fmt.Sprintf("bad request: %v", err), + Latency: "0ms", + } + } + + resp, err := client.Do(req) + latency := time.Since(start) + + detail := HealthCheckDetail{ + Type: check.Type, + Target: target, + Latency: formatLatency(latency), + } + + if err != nil { + detail.Healthy = false + detail.Error = err.Error() + return detail + } + defer resp.Body.Close() + detail.Status = resp.StatusCode + + // For "http" type, any response means the service is alive + if check.Type == "http" { + detail.Healthy = true + return detail + } + + // For "api" type, validate expectations + if check.Expect == nil { + // No expectations = just check for a response (same as http) + detail.Healthy = true + return detail + } + + // Check expected status code + if check.Expect.Status > 0 && resp.StatusCode != check.Expect.Status { + detail.Healthy = false + detail.Error = fmt.Sprintf("expected status %d, got %d", check.Expect.Status, resp.StatusCode) + return detail + } + + // Check expected body content + if check.Expect.BodyContains != "" { + body, err := io.ReadAll(io.LimitReader(resp.Body, 8192)) // read up to 8KB + if err != nil { + detail.Healthy = false + detail.Error = fmt.Sprintf("reading body: %v", err) + return detail + } + if !strings.Contains(string(body), check.Expect.BodyContains) { + detail.Healthy = false + detail.Error = fmt.Sprintf("body missing expected string %q", check.Expect.BodyContains) + return detail + } + } + + detail.Healthy = true + return detail +} + +// findProbeContainer returns the container name to probe for a stack. +// Prefers exact match with stack name, then prefix match (stack-service-N). +func findProbeContainer(stackName string, containers []ContainerInfo) string { + for _, c := range containers { + if c.Name == stackName && (c.State == StateRunning || c.State == StateUnhealthy) { + return c.Name + } + } + // Fallback: first running container with matching prefix + for _, c := range containers { + if strings.HasPrefix(c.Name, stackName) && (c.State == StateRunning || c.State == StateUnhealthy) { + return c.Name + } + } + return "" +} + +// parseInterval parses a duration string like "5m", "30s", "1h". +// Returns 5 minutes as default if parsing fails. +func parseInterval(s string) time.Duration { + if s == "" { + return 5 * time.Minute + } + d, err := time.ParseDuration(s) + if err != nil { + return 5 * time.Minute + } + return d +} + +// formatLatency formats a duration as a human-readable latency string. +func formatLatency(d time.Duration) string { + if d < time.Millisecond { + return fmt.Sprintf("%dµs", d.Microseconds()) + } + if d < time.Second { + return fmt.Sprintf("%dms", d.Milliseconds()) + } + return fmt.Sprintf("%.1fs", d.Seconds()) +} + +// methodOrEmpty returns the method string for logging, or empty for non-api checks. +func methodOrEmpty(check HealthCheckItem) string { + if check.Type == "api" && check.Method != "" { + return check.Method + } + if check.Type == "api" { + return "GET" + } + return "GET" +} diff --git a/controller/internal/stacks/manager.go b/controller/internal/stacks/manager.go index ffa8b0b..c82e156 100644 --- a/controller/internal/stacks/manager.go +++ b/controller/internal/stacks/manager.go @@ -42,20 +42,38 @@ type ContainerInfo struct { Status string `json:"status"` // e.g. "Up 3 hours (healthy)" } +// HealthProbeResult holds the latest controller-side health probe result. +type HealthProbeResult struct { + Healthy bool `json:"healthy"` + LastCheck time.Time `json:"last_check"` + Details []HealthCheckDetail `json:"details"` +} + +// HealthCheckDetail holds the result of a single health check item. +type HealthCheckDetail struct { + Type string `json:"type"` // "http", "api", "tcp" + Target string `json:"target"` // e.g. ":3456/api/v1/info" + Healthy bool `json:"healthy"` + Status int `json:"status,omitempty"` // HTTP status code (for http/api) + Latency string `json:"latency"` // e.g. "45ms" + Error string `json:"error,omitempty"` // error message if unhealthy +} + // Stack represents a docker compose stack on disk. type Stack struct { - Name string `json:"name"` - Meta Metadata `json:"meta"` - ComposePath string `json:"compose_path"` - State ContainerState `json:"state"` - Deployed bool `json:"deployed"` // Has app.yaml with deployed=true - Protected bool `json:"protected"` - Orphaned bool `json:"orphaned"` // Deployed but no catalog template - Containers []ContainerInfo `json:"containers"` - AppConfig *AppConfig `json:"app_config,omitempty"` - Deploying bool `json:"deploying"` // compose up in progress - DeployError string `json:"deploy_error,omitempty"` // last async deploy error - LastUpdated time.Time `json:"last_updated"` + Name string `json:"name"` + Meta Metadata `json:"meta"` + ComposePath string `json:"compose_path"` + State ContainerState `json:"state"` + Deployed bool `json:"deployed"` // Has app.yaml with deployed=true + Protected bool `json:"protected"` + Orphaned bool `json:"orphaned"` // Deployed but no catalog template + Containers []ContainerInfo `json:"containers"` + AppConfig *AppConfig `json:"app_config,omitempty"` + Deploying bool `json:"deploying"` // compose up in progress + DeployError string `json:"deploy_error,omitempty"` // last async deploy error + HealthProbe *HealthProbeResult `json:"health_probe,omitempty"` // controller-side probe result + LastUpdated time.Time `json:"last_updated"` } // Manager handles all docker compose stack operations. @@ -315,6 +333,13 @@ func (m *Manager) refreshStatusLocked() error { stack.Containers = containers stack.State = aggregateState(containers) } + + // Re-apply controller-side health probe results: if the last probe + // failed and Docker thinks the container is running, override to unhealthy. + if stack.State == StateRunning && stack.HealthProbe != nil && !stack.HealthProbe.Healthy { + stack.State = StateUnhealthy + } + stack.LastUpdated = time.Now() } diff --git a/controller/internal/stacks/metadata.go b/controller/internal/stacks/metadata.go index 99f3aef..a4e47cd 100644 --- a/controller/internal/stacks/metadata.go +++ b/controller/internal/stacks/metadata.go @@ -20,6 +20,7 @@ type Metadata struct { DeployFields []DeployField `yaml:"deploy_fields" json:"deploy_fields"` AppInfo AppInfo `yaml:"app_info" json:"app_info"` OptionalConfig []OptionalConfigGroup `yaml:"optional_config" json:"optional_config"` + HealthCheck *HealthCheckConfig `yaml:"healthcheck,omitempty" json:"healthcheck,omitempty"` } // AppInfo holds detailed app information for the info page. @@ -77,6 +78,29 @@ type SelectOption struct { Label string `yaml:"label" json:"label"` } +// HealthCheckConfig defines controller-side health probe configuration. +// When configured, the controller periodically probes the app's container +// and overrides the stack state to "unhealthy" if the service is not responding. +type HealthCheckConfig struct { + Interval string `yaml:"interval" json:"interval"` // e.g. "5m", "30s"; default "5m" + Checks []HealthCheckItem `yaml:"checks" json:"checks"` +} + +// HealthCheckItem defines a single health check probe. +type HealthCheckItem struct { + Type string `yaml:"type" json:"type"` // "http", "api", "tcp" + Port int `yaml:"port" json:"port"` + Path string `yaml:"path" json:"path"` // for http/api; default "/" + Method string `yaml:"method" json:"method"` // for api; default "GET" + Expect *HealthCheckExpect `yaml:"expect,omitempty" json:"expect,omitempty"` // for api +} + +// HealthCheckExpect defines expected response content for "api" type checks. +type HealthCheckExpect struct { + Status int `yaml:"status" json:"status"` // expected HTTP status code + BodyContains string `yaml:"body_contains" json:"body_contains"` // string that must appear in response body +} + // LoadMetadata reads .felhom.yml from a stack directory. // Returns default metadata if the file doesn't exist. func LoadMetadata(stackDir string) Metadata { @@ -113,6 +137,21 @@ func LoadMetadata(stackDir string) Metadata { meta.Category = "tools" } + // Default healthcheck fields + if meta.HealthCheck != nil { + if meta.HealthCheck.Interval == "" { + meta.HealthCheck.Interval = "5m" + } + for i := range meta.HealthCheck.Checks { + if meta.HealthCheck.Checks[i].Path == "" && (meta.HealthCheck.Checks[i].Type == "http" || meta.HealthCheck.Checks[i].Type == "api") { + meta.HealthCheck.Checks[i].Path = "/" + } + if meta.HealthCheck.Checks[i].Method == "" && meta.HealthCheck.Checks[i].Type == "api" { + meta.HealthCheck.Checks[i].Method = "GET" + } + } + } + // DOMAIN and SUBDOMAIN fields are always auto-filled/required — mark implicitly for i := range meta.DeployFields { if meta.DeployFields[i].Type == "domain" || meta.DeployFields[i].Type == "subdomain" {