feat: controller-side HTTP/TCP health probes

Add network-level health probing from the controller to deployed apps.
The controller probes containers over the shared Docker network and
overrides stack state to "unhealthy" if the service isn't responding.

Three probe types: http (any response = alive), api (validates status
code and body content), tcp (port reachability). Probes are configured
per app via the healthcheck: section in .felhom.yml. The scheduler runs
the prober every minute; each app's probe interval defaults to 5 minutes.

This replaces Docker-level healthchecks for distroless images (e.g.
Vikunja) that lack shell utilities, and complements existing Docker
healthchecks for other apps.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 11:11:21 +01:00
parent 077640d9bb
commit 4c5d430b1a
6 changed files with 425 additions and 13 deletions
+323
View File
@@ -0,0 +1,323 @@
package stacks
import (
"fmt"
"io"
"net"
"net/http"
"strings"
"sync"
"time"
)
// probeTarget holds the info needed to probe a single stack.
// RunHealthProbes fills it in while holding the Manager lock so that the
// probes themselves can run afterwards without the lock.
type probeTarget struct {
stackName string // key of the stack in m.stacks; used to store the result and in log lines
containerName string // container name used as the host to dial (picked by findProbeContainer)
checks []HealthCheckItem // checks to run, taken from the stack's healthcheck configuration
}
// RunHealthProbes runs controller-side health probes for all stacks that are
// running (or currently marked unhealthy), have healthcheck configuration, and
// whose probe interval has elapsed since their last probe.
// Called by the scheduler every minute.
//
// The work is split into three phases so the Manager lock is never held
// during network I/O:
//  1. collect probe targets under the read lock,
//  2. run every target's checks concurrently with no lock held,
//  3. store the results and adjust stack state under the write lock.
//
// A failing probe turns a running stack unhealthy. A passing probe only
// clears an unhealthy state when the previous probe had failed, i.e. when the
// prober itself may be the reason for that state; an unhealthy state that did
// not come from a failed probe (for example Docker's own healthcheck) is left
// untouched.
//
// The returned error is always nil: probe failures are recorded on the stack
// and logged rather than returned.
func (m *Manager) RunHealthProbes() error {
	// Phase 1: collect targets (under lock)
	m.mu.RLock()
	var targets []probeTarget
	for name, stack := range m.stacks {
		if stack.State != StateRunning && stack.State != StateUnhealthy {
			continue
		}

		hc := stack.Meta.HealthCheck
		if hc == nil || len(hc.Checks) == 0 {
			continue
		}

		// Check if interval has elapsed since last probe. A stack that has
		// never been probed (HealthProbe == nil) is always due.
		interval := parseInterval(hc.Interval)
		if stack.HealthProbe != nil && time.Since(stack.HealthProbe.LastCheck) < interval {
			continue
		}

		// Find the main container to probe (matching stack name)
		containerName := findProbeContainer(name, stack.Containers)
		if containerName == "" {
			continue
		}

		targets = append(targets, probeTarget{
			stackName:     name,
			containerName: containerName,
			checks:        hc.Checks,
		})
	}
	m.mu.RUnlock()

	if len(targets) == 0 {
		return nil
	}

	// Phase 2: run all probes concurrently (no lock held).
	// Each goroutine writes only its own slot of results, so the slice needs
	// no extra synchronisation beyond the WaitGroup.
	type probeResult struct {
		stackName string
		result    *HealthProbeResult
	}
	results := make([]probeResult, len(targets))
	var wg sync.WaitGroup
	for i, t := range targets {
		wg.Add(1)
		go func(idx int, t probeTarget) {
			defer wg.Done()
			result := m.runChecks(t)
			results[idx] = probeResult{stackName: t.stackName, result: result}
		}(i, t)
	}
	wg.Wait()

	// Phase 3: apply results and log (under lock)
	m.mu.Lock()
	okCount, failCount := 0, 0
	for _, pr := range results {
		stack, ok := m.stacks[pr.stackName]
		if !ok {
			// The stack disappeared while it was being probed.
			continue
		}

		prev := stack.HealthProbe
		stack.HealthProbe = pr.result

		if pr.result.Healthy {
			okCount++
			// Clear a previous unhealthy override, but only if the last probe
			// actually failed. Without that guard a passing probe would also
			// wipe an unhealthy state that some other source had set.
			if stack.State == StateUnhealthy && prev != nil && !prev.Healthy {
				stack.State = StateRunning
			}
			continue
		}

		failCount++
		if stack.State == StateRunning {
			stack.State = StateUnhealthy
		}
	}
	m.mu.Unlock()

	// Summary log
	if failCount > 0 {
		m.logger.Printf("[INFO] Health probes: %d ok, %d unhealthy (of %d probed)", okCount, failCount, len(targets))
	} else if m.isDebug() {
		m.logger.Printf("[DEBUG] Health probes: %d ok (of %d probed)", okCount, len(targets))
	}
	return nil
}
// runChecks executes all health check items for a single stack target and
// aggregates them into one HealthProbeResult.
//
// The result is healthy only if every check passed; all checks are always
// run, even after one has failed, so Details has one entry per check in
// configuration order. LastCheck is taken before the first check starts.
//
// Failed checks are logged at WARN level. Passed checks are logged only in
// debug mode.
func (m *Manager) runChecks(t probeTarget) *HealthProbeResult {
	result := &HealthProbeResult{
		LastCheck: time.Now(),
		Healthy:   true,
	}

	for _, check := range t.checks {
		detail := m.runSingleCheck(t.containerName, check)
		result.Details = append(result.Details, detail)

		if !detail.Healthy {
			result.Healthy = false
			if check.Type == "tcp" {
				// TCP checks have no HTTP method or path; log them in the
				// same shape as the TCP success line below.
				m.logger.Printf("[WARN] Health probe %s: TCP :%d → %s",
					t.stackName, check.Port, detail.Error)
			} else {
				m.logger.Printf("[WARN] Health probe %s: %s %s :%d%s → %s",
					t.stackName, strings.ToUpper(check.Type), methodOrEmpty(check), check.Port, check.Path, detail.Error)
			}
			continue
		}

		if !m.isDebug() {
			continue
		}
		// A non-zero status means an HTTP response was received; otherwise
		// the check is reported in the TCP format.
		if detail.Status > 0 {
			m.logger.Printf("[DEBUG] Health probe %s: %s %s :%d%s → %d (%s)",
				t.stackName, strings.ToUpper(check.Type), methodOrEmpty(check), check.Port, check.Path, detail.Status, detail.Latency)
		} else {
			m.logger.Printf("[DEBUG] Health probe %s: TCP :%d → ok (%s)",
				t.stackName, check.Port, detail.Latency)
		}
	}
	return result
}
// runSingleCheck dispatches one health check item to the matching prober and
// returns its result. "tcp" goes to probeTCP, "http" and "api" go to
// probeHTTP; any other type yields an unhealthy detail naming the unknown type.
// The detail's Target is always ":<port><path>".
func (m *Manager) runSingleCheck(containerName string, check HealthCheckItem) HealthCheckDetail {
	label := fmt.Sprintf(":%d%s", check.Port, check.Path)

	if check.Type == "tcp" {
		return m.probeTCP(containerName, check.Port, label)
	}
	if check.Type == "http" || check.Type == "api" {
		return m.probeHTTP(containerName, check, label)
	}

	// Unrecognised type: report it as a failed check rather than skipping it.
	unknown := HealthCheckDetail{Type: check.Type, Target: label}
	unknown.Error = fmt.Sprintf("unknown check type: %s", check.Type)
	return unknown
}
// probeTCP tests if a TCP port is reachable on the container.
//
// It dials containerName:port with a 5 second timeout and closes the
// connection straight away; a successful connect is all that is checked.
// target is only copied into the returned detail for display. On failure the
// dial error text is stored in Error and Healthy stays false. Latency covers
// the dial attempt in both cases.
func (m *Manager) probeTCP(containerName string, port int, target string) HealthCheckDetail {
	// JoinHostPort brackets IPv6 literals, which a plain "%s:%d" would turn
	// into an unparsable address.
	addr := net.JoinHostPort(containerName, fmt.Sprint(port))

	start := time.Now()
	conn, err := net.DialTimeout("tcp", addr, 5*time.Second)
	detail := HealthCheckDetail{
		Type:    "tcp",
		Target:  target,
		Latency: formatLatency(time.Since(start)),
	}
	if err != nil {
		detail.Error = err.Error()
		return detail
	}

	// The connection was only opened to prove reachability; a close error
	// says nothing about the service's health, so it is deliberately ignored.
	_ = conn.Close()
	detail.Healthy = true
	return detail
}
// probeHTTP makes an HTTP request to the container and evaluates the result.
//
// The request goes to http://containerName:port/path using check.Method
// ("GET" when empty), with a 5 second overall timeout. Redirects are not
// followed: the redirect response itself is what gets evaluated.
//
// For "http" type: any response = healthy, whatever the status code.
// For "api" type: validates the expect rules. A missing Expect behaves like
// "http"; a zero Expect.Status or empty Expect.BodyContains skips that rule.
// Only the first 8KB of the body are searched for BodyContains.
//
// target is only copied into the returned detail for display. Status is set
// whenever a response was received, also when an expectation then fails.
func (m *Manager) probeHTTP(containerName string, check HealthCheckItem, target string) HealthCheckDetail {
	// A path configured without its leading slash would otherwise be glued
	// onto the port ("http://app:8080health") and never parse.
	path := check.Path
	if path != "" && !strings.HasPrefix(path, "/") {
		path = "/" + path
	}
	// JoinHostPort brackets IPv6 literals so the URL stays well-formed.
	url := "http://" + net.JoinHostPort(containerName, fmt.Sprint(check.Port)) + path

	method := check.Method
	if method == "" {
		method = http.MethodGet
	}

	start := time.Now()
	client := &http.Client{
		Timeout: 5 * time.Second,
		// Evaluate the first response as-is instead of following redirects.
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}

	req, err := http.NewRequest(method, url, nil)
	if err != nil {
		// Nothing was sent, so there is no latency to report.
		return HealthCheckDetail{
			Type:    check.Type,
			Target:  target,
			Healthy: false,
			Error:   fmt.Sprintf("bad request: %v", err),
			Latency: "0ms",
		}
	}

	resp, err := client.Do(req)
	latency := time.Since(start)

	detail := HealthCheckDetail{
		Type:    check.Type,
		Target:  target,
		Latency: formatLatency(latency),
	}

	if err != nil {
		detail.Healthy = false
		detail.Error = err.Error()
		return detail
	}
	defer resp.Body.Close()

	detail.Status = resp.StatusCode

	// For "http" type, any response means the service is alive
	if check.Type == "http" {
		detail.Healthy = true
		return detail
	}

	// For "api" type, validate expectations
	if check.Expect == nil {
		// No expectations = just check for a response (same as http)
		detail.Healthy = true
		return detail
	}

	// Check expected status code
	if check.Expect.Status > 0 && resp.StatusCode != check.Expect.Status {
		detail.Healthy = false
		detail.Error = fmt.Sprintf("expected status %d, got %d", check.Expect.Status, resp.StatusCode)
		return detail
	}

	// Check expected body content
	if check.Expect.BodyContains != "" {
		body, err := io.ReadAll(io.LimitReader(resp.Body, 8192)) // read up to 8KB
		if err != nil {
			detail.Healthy = false
			detail.Error = fmt.Sprintf("reading body: %v", err)
			return detail
		}
		if !strings.Contains(string(body), check.Expect.BodyContains) {
			detail.Healthy = false
			detail.Error = fmt.Sprintf("body missing expected string %q", check.Expect.BodyContains)
			return detail
		}
	}

	detail.Healthy = true
	return detail
}
// findProbeContainer picks the container name to probe for a stack.
// Only containers whose state is running or unhealthy are considered. A
// container named exactly like the stack wins; otherwise the first container
// whose name starts with the stack name (stack-service-N) is used. Returns ""
// when no container qualifies.
func findProbeContainer(stackName string, containers []ContainerInfo) string {
	fallback := ""
	for i := range containers {
		c := &containers[i]
		if c.State != StateRunning && c.State != StateUnhealthy {
			continue
		}
		if c.Name == stackName {
			// Exact match beats any prefix match, wherever it sits in the list.
			return c.Name
		}
		if fallback == "" && strings.HasPrefix(c.Name, stackName) {
			fallback = c.Name
		}
	}
	return fallback
}
// parseInterval converts a duration string such as "5m", "30s" or "1h" into
// a time.Duration. An empty or unparsable string yields the 5 minute default.
func parseInterval(s string) time.Duration {
	const fallback = 5 * time.Minute

	// time.ParseDuration rejects the empty string, so one error check covers
	// both the "not configured" and the "malformed" case.
	if parsed, err := time.ParseDuration(s); err == nil {
		return parsed
	}
	return fallback
}
// formatLatency renders a duration as a short latency string, picking the
// unit by magnitude: whole microseconds below 1ms, whole milliseconds below
// 1s, and seconds with one decimal from 1s upwards.
func formatLatency(d time.Duration) string {
	switch {
	case d >= time.Second:
		return fmt.Sprintf("%.1fs", d.Seconds())
	case d >= time.Millisecond:
		return fmt.Sprintf("%dms", d.Milliseconds())
	default:
		return fmt.Sprintf("%dµs", d.Microseconds())
	}
}
// methodOrEmpty returns the HTTP method to show in log lines for a check.
// For "http" and "api" checks this is the configured method, or "GET" when
// none is set, which is the same default probeHTTP applies when sending the
// request. Any other check type (e.g. "tcp") has no HTTP method and yields "".
func methodOrEmpty(check HealthCheckItem) string {
	switch check.Type {
	case "http", "api":
		if check.Method != "" {
			return check.Method
		}
		return "GET"
	default:
		return ""
	}
}
+37 -12
View File
@@ -42,20 +42,38 @@ type ContainerInfo struct {
Status string `json:"status"` // e.g. "Up 3 hours (healthy)"
}
// HealthProbeResult holds the latest controller-side health probe result.
// One result covers a whole stack and aggregates all of its configured checks.
type HealthProbeResult struct {
Healthy bool `json:"healthy"` // true only if every check in Details passed
LastCheck time.Time `json:"last_check"` // taken just before the checks ran; compared against the probe interval
Details []HealthCheckDetail `json:"details"` // one entry per configured check, in configuration order
}
// HealthCheckDetail holds the result of a single health check item.
type HealthCheckDetail struct {
Type string `json:"type"` // "http", "api", "tcp" (copied from the check's configured type)
Target string `json:"target"` // ":<port><path>", e.g. ":3456/api/v1/info"
Healthy bool `json:"healthy"`
Status int `json:"status,omitempty"` // HTTP status code (for http/api); stays 0 when no response was received
Latency string `json:"latency"` // e.g. "45ms"; "0ms" when the HTTP request could not even be built
Error string `json:"error,omitempty"` // error message if unhealthy
}
// Stack represents a docker compose stack on disk.
// State is normally the aggregate of the stack's container states, but a
// failed controller-side health probe (HealthProbe) overrides a running
// stack to unhealthy.
type Stack struct {
Name string `json:"name"`
Meta Metadata `json:"meta"`
ComposePath string `json:"compose_path"`
State ContainerState `json:"state"`
Deployed bool `json:"deployed"` // Has app.yaml with deployed=true
Protected bool `json:"protected"`
Orphaned bool `json:"orphaned"` // Deployed but no catalog template
Containers []ContainerInfo `json:"containers"`
AppConfig *AppConfig `json:"app_config,omitempty"`
Deploying bool `json:"deploying"` // compose up in progress
DeployError string `json:"deploy_error,omitempty"` // last async deploy error
LastUpdated time.Time `json:"last_updated"`
Name string `json:"name"`
Meta Metadata `json:"meta"`
ComposePath string `json:"compose_path"`
State ContainerState `json:"state"`
Deployed bool `json:"deployed"` // Has app.yaml with deployed=true
Protected bool `json:"protected"`
Orphaned bool `json:"orphaned"` // Deployed but no catalog template
Containers []ContainerInfo `json:"containers"`
AppConfig *AppConfig `json:"app_config,omitempty"`
Deploying bool `json:"deploying"` // compose up in progress
DeployError string `json:"deploy_error,omitempty"` // last async deploy error
HealthProbe *HealthProbeResult `json:"health_probe,omitempty"` // controller-side probe result
LastUpdated time.Time `json:"last_updated"`
}
// Manager handles all docker compose stack operations.
@@ -315,6 +333,13 @@ func (m *Manager) refreshStatusLocked() error {
stack.Containers = containers
stack.State = aggregateState(containers)
}
// Re-apply controller-side health probe results: if the last probe
// failed and Docker thinks the container is running, override to unhealthy.
if stack.State == StateRunning && stack.HealthProbe != nil && !stack.HealthProbe.Healthy {
stack.State = StateUnhealthy
}
stack.LastUpdated = time.Now()
}
+39
View File
@@ -20,6 +20,7 @@ type Metadata struct {
DeployFields []DeployField `yaml:"deploy_fields" json:"deploy_fields"`
AppInfo AppInfo `yaml:"app_info" json:"app_info"`
OptionalConfig []OptionalConfigGroup `yaml:"optional_config" json:"optional_config"`
HealthCheck *HealthCheckConfig `yaml:"healthcheck,omitempty" json:"healthcheck,omitempty"`
}
// AppInfo holds detailed app information for the info page.
@@ -77,6 +78,29 @@ type SelectOption struct {
Label string `yaml:"label" json:"label"`
}
// HealthCheckConfig defines controller-side health probe configuration.
// When configured, the controller periodically probes the app's container
// and overrides the stack state to "unhealthy" if the service is not responding.
// A config with no checks is ignored by the prober.
type HealthCheckConfig struct {
Interval string `yaml:"interval" json:"interval"` // Go duration string, e.g. "5m", "30s"; default "5m" (also used when the value does not parse)
Checks []HealthCheckItem `yaml:"checks" json:"checks"` // all checks must pass for the stack to count as healthy
}
// HealthCheckItem defines a single health check probe.
// Type selects the prober: "tcp" only tests that Port accepts a connection,
// "http" treats any HTTP response as healthy, and "api" additionally
// validates the response against Expect.
type HealthCheckItem struct {
Type string `yaml:"type" json:"type"` // "http", "api", "tcp"; any other value fails the check as unknown
Port int `yaml:"port" json:"port"` // port on the probed container
Path string `yaml:"path" json:"path"` // for http/api; default "/"
Method string `yaml:"method" json:"method"` // HTTP method for http/api requests; default "GET"
Expect *HealthCheckExpect `yaml:"expect,omitempty" json:"expect,omitempty"` // for api; nil means any response is healthy
}
// HealthCheckExpect defines expected response content for "api" type checks.
// Each rule is optional: a zero Status or an empty BodyContains is not checked.
type HealthCheckExpect struct {
Status int `yaml:"status" json:"status"` // expected HTTP status code; 0 accepts any status
BodyContains string `yaml:"body_contains" json:"body_contains"` // string that must appear in response body (only the first 8KB are searched)
}
// LoadMetadata reads .felhom.yml from a stack directory.
// Returns default metadata if the file doesn't exist.
func LoadMetadata(stackDir string) Metadata {
@@ -113,6 +137,21 @@ func LoadMetadata(stackDir string) Metadata {
meta.Category = "tools"
}
// Default healthcheck fields
if meta.HealthCheck != nil {
if meta.HealthCheck.Interval == "" {
meta.HealthCheck.Interval = "5m"
}
for i := range meta.HealthCheck.Checks {
if meta.HealthCheck.Checks[i].Path == "" && (meta.HealthCheck.Checks[i].Type == "http" || meta.HealthCheck.Checks[i].Type == "api") {
meta.HealthCheck.Checks[i].Path = "/"
}
if meta.HealthCheck.Checks[i].Method == "" && meta.HealthCheck.Checks[i].Type == "api" {
meta.HealthCheck.Checks[i].Method = "GET"
}
}
}
// DOMAIN and SUBDOMAIN fields are always auto-filled/required — mark implicitly
for i := range meta.DeployFields {
if meta.DeployFields[i].Type == "domain" || meta.DeployFields[i].Type == "subdomain" {