v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups
Phase 2 (Monitoring & Health): - Central job scheduler replacing ad-hoc goroutines (internal/scheduler) - CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go) - Temperature reading from /sys/class/thermal + /host/sys (Docker mount) - Load average from /proc/loadavg - Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go) - System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go) Phase 3 (Backups): - Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go) - Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes - Restic backup integration with auto-password generation (internal/backup/restic.go) - Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go) - Manual backup trigger via dashboard button and POST /api/backup/run Dashboard UI: - CPU usage bar with load average display - Temperature with colored indicator dot - Backup status card with last run time, DB count, repo stats - "Mentés most" button for manual backup trigger Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
//go:build linux
|
||||
|
||||
package system
|
||||
|
||||
import (
	"bufio"
	"context"
	"fmt"
	"os"
	"strconv"
	"strings"
	"sync"
	"time"
)
|
||||
|
||||
// CPUCollector samples overall CPU usage in the background by reading
// /proc/stat and exposes the most recent result through CPUPercent.
//
// CPUPercent may be called concurrently with the sampling goroutine. Start and
// Stop touch the stored cancel function without locking, so call them from a
// single goroutine.
type CPUCollector struct {
	// mu guards cpuPercent.
	mu sync.RWMutex
	// cpuPercent is the latest busy percentage; it stays 0 until the first
	// pair of samples has been taken.
	cpuPercent float64
	// sampleRate is the interval between two /proc/stat reads.
	sampleRate time.Duration
	// cancel stops the sampling goroutine; nil until Start has been called.
	cancel context.CancelFunc
}

// NewCPUCollector creates a CPU collector that, once started, samples every
// sampleRate. A non-positive sampleRate falls back to one second when the
// sampling loop starts.
func NewCPUCollector(sampleRate time.Duration) *CPUCollector {
	return &CPUCollector{
		sampleRate: sampleRate,
	}
}

// Start begins background CPU sampling. Sampling runs until ctx is cancelled
// or Stop is called. Calling Start again first stops the previous sampling
// goroutine so that it is not leaked.
func (c *CPUCollector) Start(ctx context.Context) {
	if c.cancel != nil {
		c.cancel()
	}
	ctx, c.cancel = context.WithCancel(ctx)
	go c.loop(ctx)
}

// Stop stops the background CPU sampling. It is a no-op if Start was never
// called and may be called more than once.
func (c *CPUCollector) Stop() {
	if c.cancel != nil {
		c.cancel()
	}
}

// CPUPercent returns the latest CPU usage percentage (0-100). It returns 0
// until the first measurement has completed.
func (c *CPUCollector) CPUPercent() float64 {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.cpuPercent
}

// loop reads /proc/stat once per sample interval and publishes the busy
// percentage between consecutive readings until ctx is cancelled. Readings
// that fail or that show counters running backwards leave the previously
// published value in place.
func (c *CPUCollector) loop(ctx context.Context) {
	// A non-positive interval would make the loop spin (and NewTicker panic).
	const fallbackInterval = time.Second
	interval := c.sampleRate
	if interval <= 0 {
		interval = fallbackInterval
	}

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	prevIdle, prevTotal, err := readCPUStat()
	havePrev := err == nil

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
		}

		idle, total, err := readCPUStat()
		if err != nil {
			// Drop the baseline so a delta is never computed across a gap.
			havePrev = false
			continue
		}

		if havePrev {
			if percent, ok := cpuBusyPercent(prevIdle, prevTotal, idle, total); ok {
				c.mu.Lock()
				c.cpuPercent = percent
				c.mu.Unlock()
			}
		}
		prevIdle, prevTotal, havePrev = idle, total, true
	}
}

// cpuBusyPercent converts two (idle, total) jiffy readings into the busy
// percentage of the interval between them. ok is false when the total counter
// did not advance, in which case no meaningful percentage exists. The counters
// are unsigned, so a backwards-moving idle counter is clamped instead of being
// allowed to wrap around.
func cpuBusyPercent(idle1, total1, idle2, total2 uint64) (percent float64, ok bool) {
	if total2 <= total1 {
		return 0, false
	}
	totalDelta := total2 - total1

	var idleDelta uint64
	if idle2 > idle1 {
		idleDelta = idle2 - idle1
	}
	if idleDelta > totalDelta {
		idleDelta = totalDelta
	}

	return float64(totalDelta-idleDelta) / float64(totalDelta) * 100, true
}

// readCPUStat reads the first line of /proc/stat and returns the idle
// (idle + iowait) and total CPU jiffies parsed from it.
func readCPUStat() (idle, total uint64, err error) {
	f, err := os.Open("/proc/stat")
	if err != nil {
		return 0, 0, err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	if !scanner.Scan() {
		if scanErr := scanner.Err(); scanErr != nil {
			return 0, 0, fmt.Errorf("reading /proc/stat: %w", scanErr)
		}
		return 0, 0, fmt.Errorf("empty /proc/stat")
	}

	return parseCPUStatLine(scanner.Text())
}

// parseCPUStatLine parses the aggregate CPU line of /proc/stat.
// Line format: cpu <user> <nice> <system> <idle> <iowait> <irq> <softirq> <steal> ...
// It returns idle+iowait as idle and the sum of the first eight counters as
// total. Any further fields on the line are ignored. A counter that is not a
// valid unsigned decimal number is reported as an error.
func parseCPUStatLine(line string) (idle, total uint64, err error) {
	if !strings.HasPrefix(line, "cpu ") {
		return 0, 0, fmt.Errorf("unexpected /proc/stat first line: %s", line)
	}

	fields := strings.Fields(line)
	if len(fields) < 9 {
		return 0, 0, fmt.Errorf("/proc/stat has too few fields: %d", len(fields))
	}

	// Positions within the counters that follow the "cpu" label:
	// user(0) nice(1) system(2) idle(3) iowait(4) irq(5) softirq(6) steal(7).
	const (
		idleIdx   = 3
		iowaitIdx = 4
		counters  = 8
	)

	var values [counters]uint64
	for i := range values {
		v, parseErr := strconv.ParseUint(fields[i+1], 10, 64)
		if parseErr != nil {
			return 0, 0, fmt.Errorf("parsing /proc/stat field %d %q: %w", i+1, fields[i+1], parseErr)
		}
		values[i] = v
		total += v
	}

	return values[idleIdx] + values[iowaitIdx], total, nil
}
|
||||
@@ -0,0 +1,25 @@
|
||||
//go:build !linux
|
||||
|
||||
package system
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
)
|
||||
|
||||
// CPUCollector is a placeholder on platforms other than Linux; none of its
// methods do any work.
type CPUCollector struct{}

// NewCPUCollector returns a placeholder collector on non-Linux platforms. The
// sample rate argument is ignored.
func NewCPUCollector(time.Duration) *CPUCollector {
	return new(CPUCollector)
}

// Start does nothing on non-Linux platforms.
func (*CPUCollector) Start(context.Context) {
}

// Stop does nothing on non-Linux platforms.
func (*CPUCollector) Stop() {
}

// CPUPercent reports 0 on non-Linux platforms.
func (*CPUCollector) CPUPercent() float64 {
	return 0
}
|
||||
@@ -17,4 +17,11 @@ type SystemInfo struct {
|
||||
HDDAvailGB float64 `json:"hdd_avail_gb,omitempty"`
|
||||
HDDPercent float64 `json:"hdd_percent,omitempty"`
|
||||
HDDConfigured bool `json:"hdd_configured"`
|
||||
|
||||
CPUPercent float64 `json:"cpu_percent"`
|
||||
LoadAvg1 float64 `json:"load_avg_1"`
|
||||
LoadAvg5 float64 `json:"load_avg_5"`
|
||||
LoadAvg15 float64 `json:"load_avg_15"`
|
||||
TemperatureCelsius float64 `json:"temperature_celsius"`
|
||||
TemperatureSource string `json:"temperature_source,omitempty"`
|
||||
}
|
||||
|
||||
@@ -6,13 +6,16 @@ import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// GetInfo reads system memory and disk usage.
|
||||
// GetInfo reads system memory, disk, CPU, load, and temperature info.
|
||||
// hddPath is the mount path for external HDD; if empty, HDD info is skipped.
|
||||
func GetInfo(hddPath string) SystemInfo {
|
||||
// cpuCollector provides the latest CPU usage sample; may be nil.
|
||||
func GetInfo(hddPath string, cpuCollector *CPUCollector) SystemInfo {
|
||||
info := SystemInfo{}
|
||||
|
||||
// --- Memory from /proc/meminfo ---
|
||||
@@ -27,6 +30,17 @@ func GetInfo(hddPath string) SystemInfo {
|
||||
readDiskUsage(hddPath, &info.HDDTotalGB, &info.HDDUsedGB, &info.HDDAvailGB, &info.HDDPercent)
|
||||
}
|
||||
|
||||
// --- Load average ---
|
||||
readLoadAvg(&info)
|
||||
|
||||
// --- Temperature ---
|
||||
readTemperature(&info)
|
||||
|
||||
// --- CPU from collector ---
|
||||
if cpuCollector != nil {
|
||||
info.CPUPercent = cpuCollector.CPUPercent()
|
||||
}
|
||||
|
||||
return info
|
||||
}
|
||||
|
||||
@@ -72,7 +86,6 @@ func readMemInfo(info *SystemInfo) {
|
||||
|
||||
// parseMemLine extracts the kB value from a /proc/meminfo line like "MemTotal: 16384000 kB"
|
||||
func parseMemLine(line string) uint64 {
|
||||
// Remove label prefix up to ':'
|
||||
parts := strings.SplitN(line, ":", 2)
|
||||
if len(parts) < 2 {
|
||||
return 0
|
||||
@@ -99,7 +112,7 @@ func readDiskUsage(path string, totalGB, usedGB, availGB *float64, percent *floa
|
||||
bsize := uint64(stat.Bsize)
|
||||
total := stat.Blocks * bsize
|
||||
avail := stat.Bavail * bsize
|
||||
used := total - (stat.Bfree * bsize) // Bfree includes reserved blocks
|
||||
used := total - (stat.Bfree * bsize)
|
||||
|
||||
const gb = 1024 * 1024 * 1024
|
||||
*totalGB = float64(total) / gb
|
||||
@@ -109,3 +122,117 @@ func readDiskUsage(path string, totalGB, usedGB, availGB *float64, percent *floa
|
||||
*percent = float64(used) / float64(total) * 100
|
||||
}
|
||||
}
|
||||
|
||||
// readLoadAvg reads 1/5/15 minute load averages from /proc/loadavg.
|
||||
func readLoadAvg(info *SystemInfo) {
|
||||
data, err := os.ReadFile("/proc/loadavg")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
fmt.Sscanf(string(data), "%f %f %f", &info.LoadAvg1, &info.LoadAvg5, &info.LoadAvg15)
|
||||
}
|
||||
|
||||
// readTemperature reads CPU/SoC temperature from thermal zones.
|
||||
// Tries /host/sys first (Docker mount), then /sys (native).
|
||||
func readTemperature(info *SystemInfo) {
|
||||
prefixes := []string{"/host/sys", "/sys"}
|
||||
|
||||
for _, prefix := range prefixes {
|
||||
if readThermalZones(prefix, info) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: try hwmon
|
||||
for _, prefix := range prefixes {
|
||||
if readHwmon(prefix, info) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func readThermalZones(sysPrefix string, info *SystemInfo) bool {
|
||||
pattern := filepath.Join(sysPrefix, "class", "thermal", "thermal_zone*", "temp")
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil || len(matches) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
sort.Strings(matches)
|
||||
|
||||
var maxTemp float64
|
||||
var maxSource string
|
||||
|
||||
for _, tempPath := range matches {
|
||||
data, err := os.ReadFile(tempPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var milliDeg int64
|
||||
if _, err := fmt.Sscanf(strings.TrimSpace(string(data)), "%d", &milliDeg); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
temp := float64(milliDeg) / 1000.0
|
||||
|
||||
// Read the type file for the label
|
||||
zoneDir := filepath.Dir(tempPath)
|
||||
typePath := filepath.Join(zoneDir, "type")
|
||||
typeData, err := os.ReadFile(typePath)
|
||||
source := strings.TrimSpace(string(typeData))
|
||||
if err != nil || source == "" {
|
||||
source = filepath.Base(zoneDir)
|
||||
}
|
||||
|
||||
if temp > maxTemp {
|
||||
maxTemp = temp
|
||||
maxSource = source
|
||||
}
|
||||
}
|
||||
|
||||
if maxTemp > 0 {
|
||||
info.TemperatureCelsius = maxTemp
|
||||
info.TemperatureSource = maxSource
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func readHwmon(sysPrefix string, info *SystemInfo) bool {
|
||||
pattern := filepath.Join(sysPrefix, "class", "hwmon", "hwmon*", "temp1_input")
|
||||
matches, err := filepath.Glob(pattern)
|
||||
if err != nil || len(matches) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
var maxTemp float64
|
||||
var maxSource string
|
||||
|
||||
for _, tempPath := range matches {
|
||||
data, err := os.ReadFile(tempPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
var milliDeg int64
|
||||
if _, err := fmt.Sscanf(strings.TrimSpace(string(data)), "%d", &milliDeg); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
temp := float64(milliDeg) / 1000.0
|
||||
source := filepath.Base(filepath.Dir(tempPath))
|
||||
|
||||
if temp > maxTemp {
|
||||
maxTemp = temp
|
||||
maxSource = source
|
||||
}
|
||||
}
|
||||
|
||||
if maxTemp > 0 {
|
||||
info.TemperatureCelsius = maxTemp
|
||||
info.TemperatureSource = maxSource
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ package system
|
||||
import "fmt"
|
||||
|
||||
// GetInfo returns empty system info on non-Linux platforms.
|
||||
func GetInfo(_ string) SystemInfo {
|
||||
func GetInfo(_ string, _ *CPUCollector) SystemInfo {
|
||||
return SystemInfo{}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user