Compare commits
16 Commits
6592bfe309
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e86091f7d | |||
| 754564167f | |||
| 40f5532570 | |||
| 0a2efb86ac | |||
| b40090dec1 | |||
| 2370f005c6 | |||
| 05fa40ff5d | |||
| ef77ab9285 | |||
| e0fd669f7c | |||
| 877cda7be1 | |||
| 0887848d29 | |||
| 565c4c8bd0 | |||
| 998cd150a1 | |||
| 1a1cded065 | |||
| a66cef8a9e | |||
| d67ec2af65 |
@@ -0,0 +1,344 @@
|
|||||||
|
---
|
||||||
|
# ============================================================================
|
||||||
|
# wan-monitor — internet connection quality monitoring
|
||||||
|
# Single pod (3 containers) in admin-system:
|
||||||
|
# - blackbox : prometheus blackbox-exporter (HTTP phases, ICMP, DNS) :9115
|
||||||
|
# - wan-probe : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
|
||||||
|
# - metrics-http : busybox httpd serving /shared/metrics :9116
|
||||||
|
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
|
||||||
|
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
|
||||||
|
# ============================================================================
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: wan-monitor-blackbox
|
||||||
|
namespace: admin-system
|
||||||
|
labels:
|
||||||
|
app: wan-monitor
|
||||||
|
data:
|
||||||
|
blackbox.yml: |
|
||||||
|
modules:
|
||||||
|
http_2xx:
|
||||||
|
prober: http
|
||||||
|
timeout: 10s
|
||||||
|
http:
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
ip_protocol_fallback: false
|
||||||
|
method: GET
|
||||||
|
fail_if_not_ssl: false
|
||||||
|
icmp:
|
||||||
|
prober: icmp
|
||||||
|
timeout: 5s
|
||||||
|
icmp:
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
ip_protocol_fallback: false
|
||||||
|
dns_udp:
|
||||||
|
prober: dns
|
||||||
|
timeout: 5s
|
||||||
|
dns:
|
||||||
|
transport_protocol: udp
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
query_name: "telex.hu"
|
||||||
|
query_type: "A"
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: wan-monitor-scripts
|
||||||
|
namespace: admin-system
|
||||||
|
labels:
|
||||||
|
app: wan-monitor
|
||||||
|
data:
|
||||||
|
metrics-header.prom: |
|
||||||
|
# HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
|
||||||
|
# TYPE wan_irtt_rtt_seconds gauge
|
||||||
|
# HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
|
||||||
|
# TYPE wan_irtt_jitter_seconds gauge
|
||||||
|
# HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
|
||||||
|
# TYPE wan_irtt_loss_ratio gauge
|
||||||
|
# HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
|
||||||
|
# TYPE wan_irtt_late_ratio gauge
|
||||||
|
# HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
|
||||||
|
# TYPE wan_irtt_duplicate_ratio gauge
|
||||||
|
# HELP wan_irtt_packets irtt packet counters for the run
|
||||||
|
# TYPE wan_irtt_packets gauge
|
||||||
|
# HELP wan_irtt_success 1 if the irtt run produced stats
|
||||||
|
# TYPE wan_irtt_success gauge
|
||||||
|
# HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
|
||||||
|
# TYPE wan_throughput_bits_per_second gauge
|
||||||
|
# HELP wan_throughput_success 1 if the throughput test succeeded
|
||||||
|
# TYPE wan_throughput_success gauge
|
||||||
|
# HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
|
||||||
|
# TYPE wan_probe_last_run_timestamp_seconds gauge
|
||||||
|
|
||||||
|
irtt_to_prom.py: |
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
|
||||||
|
# args: <condition> <target>
|
||||||
|
# Hardened: tolerates missing/null/NaN/Inf fields and clamps ratios to 0..1
|
||||||
|
# so a pathological irtt run can never emit an out-of-range or invalid sample.
|
||||||
|
import json, sys, time, math
|
||||||
|
|
||||||
|
def esc_label(value):
    """Escape a string for use as a Prometheus label value.

    The text exposition format requires backslash, double quote and newline
    to be backslash-escaped inside a quoted label value; without this an
    argument containing one of them would produce an unparsable sample line.
    """
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


# Positional args: <condition> <target>; defaults apply when an arg is absent.
cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
# Label pairs shared by every sample line this script prints.
L = f'target="{esc_label(target)}",condition="{esc_label(cond)}"'
# Unix time in whole seconds, captured once at start-up.
ts = f'{time.time():.0f}'
|
||||||
|
|
||||||
|
def num(x, default=0.0):
    """Return ``x`` as a finite float, or ``default`` when that is impossible.

    ``default`` is returned when ``x`` cannot be converted by ``float()``
    (None, non-numeric strings, containers), when it is an integer too large
    to be represented as a float, or when the result is NaN or +/-Inf.
    """
    try:
        v = float(x)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: json parses arbitrarily large integer literals into
        # Python ints, and float() refuses those that exceed the float range.
        return default
    return v if math.isfinite(v) else default
|
||||||
|
|
||||||
|
def pct_ratio(x):
    """Turn a percentage into a ratio limited to the range 0.0 .. 1.0.

    The input is first sanitised by ``num`` (unusable values count as 0),
    divided by 100, and then clamped so the result never leaves [0, 1].
    """
    ratio = num(x) / 100.0
    if ratio <= 0.0:
        return 0.0
    if ratio >= 1.0:
        return 1.0
    return ratio
|
||||||
|
|
||||||
|
def fail():
    """Report an unsuccessful irtt run and terminate the script.

    Prints ``wan_irtt_success ... 0`` and the last-run timestamp sample, then
    exits with status 0 so the caller still receives a fragment with these
    two lines.
    """
    failure_lines = (
        f'wan_irtt_success{{{L}}} 0',
        f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}',
    )
    print("\n".join(failure_lines))
    sys.exit(0)
|
||||||
|
|
||||||
|
# Read the JSON report from stdin and pull out its "stats" object.  Any
# failure to read or parse it is reported as an unsuccessful run: fail()
# prints the failure samples and exits, hence the deliberately broad except.
try:
    s = json.load(sys.stdin).get("stats")
except Exception:
    fail()
if not isinstance(s, dict):
    fail()

# Round-trip-time statistics.  A missing, null or wrongly typed "rtt" section
# is treated as empty so every statistic falls back to 0 instead of raising
# AttributeError half-way through the output.
rtt = s.get("rtt")
if not isinstance(rtt, dict):
    rtt = {}
for k in ("min", "mean", "median", "max", "stddev"):
    # Divided by 1e9 to publish seconds (presumably irtt reports nanoseconds).
    print(f'wan_irtt_rtt_seconds{{{L},stat="{k}"}} {num(rtt.get(k)) / 1e9}')
|
||||||
|
|
||||||
|
def ipdv(key):
    """Return the ``mean`` of the stats section ``key``, divided by 1e9.

    A missing, null or non-dict section yields 0.0 rather than raising, so a
    malformed report cannot abort the script after some samples were printed.
    """
    d = s.get(key)
    if not isinstance(d, dict):
        return 0.0
    # Same 1e9 scaling as the RTT samples (presumably nanoseconds -> seconds).
    return num(d.get("mean")) / 1e9
|
||||||
|
|
||||||
|
# Jitter (IPDV mean) per direction: (label value, stats section).
for direction, section in (
    ("round_trip", "ipdv_round_trip"),
    ("send", "ipdv_send"),
    ("receive", "ipdv_receive"),
):
    print(f'wan_irtt_jitter_seconds{{{L},direction="{direction}"}} {ipdv(section)}')

# Packet loss per direction: (label value, percent field in stats).
for direction, field in (
    ("round_trip", "packet_loss_percent"),
    ("upstream", "upstream_loss_percent"),
    ("downstream", "downstream_loss_percent"),
):
    print(f'wan_irtt_loss_ratio{{{L},direction="{direction}"}} {pct_ratio(s.get(field))}')

late = pct_ratio(s.get("late_packets_percent"))
print(f'wan_irtt_late_ratio{{{L}}} {late}')
duplicate = pct_ratio(s.get("duplicate_percent"))
print(f'wan_irtt_duplicate_ratio{{{L}}} {duplicate}')

# Packet counters for the run: (label value, counter field in stats).
for kind, field in (
    ("sent", "packets_sent"),
    ("received", "packets_received"),
    ("server_received", "server_packets_received"),
):
    print(f'wan_irtt_packets{{{L},kind="{kind}"}} {int(num(s.get(field)))}')

# Reaching this point means the stats object was usable: mark the run as
# successful and stamp it with the start-up time.
print(f'wan_irtt_success{{{L}}} 1')
print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
|
||||||
|
|
||||||
|
tput_to_prom.py: |
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# iperf3 JSON (stdin) -> Prometheus sample lines. args: <direction> <target>
|
||||||
|
import json, sys, time
|
||||||
|
def throughput_samples(doc, direction, target, ts):
    """Build the Prometheus sample lines for one parsed iperf3 report.

    Args:
        doc: the decoded JSON report (any type; anything other than the
            expected nested dicts counts as a failed run).
        direction: value for the ``direction`` label.
        target: value for the ``target`` label.
        ts: timestamp text used as the value of the last-run sample.

    Returns:
        A list of sample lines.  On success: throughput, ``success 1`` and the
        timestamp.  Otherwise: ``success 0`` and the timestamp.  A throughput
        that is missing, non-numeric, negative, NaN or infinite is a failure,
        so an invalid value is never published as a successful measurement.
    """
    import math  # local import: the script's import line lies outside this block

    def esc(value):
        # Backslash, double quote and newline must be escaped in label values.
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    labels = f'target="{esc(target)}",direction="{esc(direction)}"'
    try:
        bps = doc["end"]["sum_received"]["bits_per_second"]
        usable = (
            isinstance(bps, (int, float))
            and not isinstance(bps, bool)
            and math.isfinite(bps)
            and bps >= 0
        )
    except (KeyError, IndexError, TypeError, OverflowError):
        # Missing keys, wrong container types, or an int too large for a float.
        usable = False

    lines = []
    if usable:
        lines.append(f'wan_throughput_bits_per_second{{{labels}}} {bps:.0f}')
        lines.append(f'wan_throughput_success{{{labels}}} 1')
    else:
        lines.append(f'wan_throughput_success{{{labels}}} 0')
    lines.append(
        f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{labels}}} {ts}'
    )
    return lines


def main():
    """Read an iperf3 JSON report from stdin and print its sample lines.

    Positional args: <direction> <target>, defaulting to "download" and
    "hetzner" when absent.
    """
    direction = sys.argv[1] if len(sys.argv) > 1 else "download"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    ts = f'{time.time():.0f}'
    try:
        doc = json.load(sys.stdin)
    except Exception:
        # Best effort by design: unreadable or invalid input is reported as a
        # failed run instead of crashing and leaving no samples at all.
        doc = None
    print("\n".join(throughput_samples(doc, direction, target, ts)))


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
|
probe-loop.sh: |
|
||||||
|
#!/bin/sh
|
||||||
|
# Treat references to unset variables as errors.
set -u

# Fixed paths: the shared output directory and the static HELP/TYPE header.
SHARED=/shared
HDR=/scripts/metrics-header.prom

# Remote endpoint; HETZNER_HOST is mandatory and the script aborts without it.
HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"
IRTT_PORT="${IRTT_PORT:-2112}"
IPERF_PORT="${IPERF_PORT:-5201}"

# irtt settings.
IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
IRTT_DURATION="${IRTT_DURATION:-60}"    # seconds (numeric, for timeout math)

# Throughput settings.
TPUT_EVERY="${TPUT_EVERY:-900}"         # seconds between throughput tests
TPUT_TIME="${TPUT_TIME:-10}"            # iperf3 seconds per direction

# Values of the "target" label passed to the two converter scripts.
IRTT_TARGET="${IRTT_TARGET:-hetzner}"
TPUT_TARGET="${TPUT_TARGET:-hetzner}"

# Optional --hmac option for the irtt client; empty when IRTT_HMAC is unset.
HMAC_OPT=""
if [ -n "${IRTT_HMAC:-}" ]; then
    HMAC_OPT="--hmac=${IRTT_HMAC}"
fi

# Start with empty fragments so the first assemble() has every input file.
mkdir -p "$SHARED"
: > "$SHARED/.irtt.prom"
: > "$SHARED/.irttload.prom"
: > "$SHARED/.tput.prom"

# Serve the header immediately so the first scrapes don't 404.
cp "$HDR" "$SHARED/metrics"
|
||||||
|
|
||||||
|
# Concatenate fragments into the served file via temp + atomic rename.
|
||||||
|
# assemble: rebuild the served metrics file from the header and the three
# fragments.  The result is written to a temp file and renamed into place so
# readers never see a half-written file.  If the concatenation fails, the temp
# file is discarded and the previously published file is left untouched
# (returns 1) instead of publishing a truncated result.
assemble() {
    if cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
        > "$SHARED/.metrics.tmp" 2>/dev/null
    then
        mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
    else
        rm -f "$SHARED/.metrics.tmp"
        return 1
    fi
}
|
||||||
|
|
||||||
|
# Each fragment is written to <file>.tmp then renamed, so assemble() never
|
||||||
|
# cats a partially written file (the cause of the impossible loss spikes).
|
||||||
|
# run_irtt: run one irtt client session and convert its JSON output into a
# metrics fragment.
#   $1  condition label passed to the converter
#   $2  output fragment path (written as "$2.tmp", then renamed)
#   $3  test duration in seconds; the timeout allows 25 s beyond it
# ${HMAC_OPT:+"$HMAC_OPT"} expands to nothing when no key is configured and to
# exactly one word otherwise, so a key containing whitespace or glob
# characters is passed through intact instead of being split or expanded.
run_irtt() {
    timeout "$(( $3 + 25 ))" irtt client -i "$IRTT_INTERVAL" -d "${3}s" -q \
        ${HMAC_OPT:+"$HMAC_OPT"} \
        -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
        | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2.tmp"
    mv "$2.tmp" "$2"
}
|
||||||
|
|
||||||
|
# run_tput: measure download (iperf3 -R) and then upload throughput, convert
# both JSON reports to samples, and publish them as one fragment.  Both
# results are collected in a partial file that is renamed into place at the end.
run_tput() {
    # Parallel streams: a single stream can't fill the pipe over the RTT.
    P="${IPERF_PARALLEL:-4}"
    # Per-test timeout: the test time plus 20 s.
    TO="$(( TPUT_TIME + 20 ))"
    TMP="$SHARED/.tput.prom.partial"
    {
        timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" \
            --connect-timeout 5000 -R -J 2>/dev/null \
            | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET"
        timeout "$TO" iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" \
            --connect-timeout 5000 -J 2>/dev/null \
            | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET"
    } > "$TMP"
    mv "$TMP" "$SHARED/.tput.prom"
}
|
||||||
|
|
||||||
|
# Main loop.  Each cycle runs an idle irtt test and republishes the metrics;
# once TPUT_EVERY seconds have passed since the last throughput test it also
# runs iperf3 with a second irtt session alongside it.
last_tput=0
# Lower bound on cycle length.  Normally irtt blocks for ~IRTT_DURATION, but
# when it exits at once the loop would otherwise spin with no delay at all.
MIN_CYCLE="${PROBE_MIN_CYCLE:-5}"
while true; do
    cycle_start=$(date +%s)
    run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION"    # blocks ~IRTT_DURATION = loop cadence
    assemble
    now=$(date +%s)
    if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
        # Long enough to span both iperf3 directions plus a small margin.
        LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))
        run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" &    # concurrent = bufferbloat
        LOADPID=$!
        run_tput
        wait "$LOADPID" 2>/dev/null
        last_tput="$now"
        assemble
    fi
    # Back off when the whole cycle finished faster than MIN_CYCLE seconds.
    spent=$(( $(date +%s) - cycle_start ))
    if [ "$spent" -lt "$MIN_CYCLE" ]; then
        sleep "$(( MIN_CYCLE - spent ))"
    fi
done
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: wan-monitor
|
||||||
|
namespace: admin-system
|
||||||
|
labels:
|
||||||
|
app: wan-monitor
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: wan-monitor
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: wan-monitor
|
||||||
|
annotations:
|
||||||
|
enable.version-checker.io/blackbox: "true"
|
||||||
|
enable.version-checker.io/metrics-http: "true"
|
||||||
|
enable.version-checker.io/wan-probe: "true"
|
||||||
|
match-regex.version-checker.io/blackbox: "^v[0-9]+\\.[0-9]+\\.[0-9]+$"
|
||||||
|
match-regex.version-checker.io/metrics-http: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
|
||||||
|
match-regex.version-checker.io/wan-probe: "^[0-9]+\\.[0-9]+\\.[0-9]+$"
|
||||||
|
spec:
|
||||||
|
enableServiceLinks: false
|
||||||
|
containers:
|
||||||
|
- name: blackbox
|
||||||
|
image: quay.io/prometheus/blackbox-exporter:v0.28.0
|
||||||
|
args:
|
||||||
|
- --config.file=/etc/blackbox/blackbox.yml
|
||||||
|
- --web.listen-address=:9115
|
||||||
|
ports:
|
||||||
|
- name: blackbox
|
||||||
|
containerPort: 9115
|
||||||
|
securityContext:
|
||||||
|
capabilities:
|
||||||
|
add: ["NET_RAW"] # required for the ICMP prober
|
||||||
|
resources:
|
||||||
|
requests: { cpu: 10m, memory: 32Mi }
|
||||||
|
limits: { memory: 64Mi }
|
||||||
|
volumeMounts:
|
||||||
|
- name: blackbox-config
|
||||||
|
mountPath: /etc/blackbox
|
||||||
|
readOnly: true
|
||||||
|
|
||||||
|
- name: wan-probe
|
||||||
|
# Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
|
||||||
|
image: gitea.dooplex.hu/admin/wan-probe:0.1.0
|
||||||
|
command: ["/bin/sh", "/scripts/probe-loop.sh"]
|
||||||
|
env:
|
||||||
|
- name: HETZNER_HOST
|
||||||
|
# MUST be the Hetzner origin: a DNS-only (grey-cloud) record or raw IP.
|
||||||
|
# NOT the Cloudflare-proxied jarrs.eu — CF only forwards HTTP/HTTPS, so
|
||||||
|
# UDP 2112 (irtt) / TCP 5201 (iperf3) never reach the origin behind it.
|
||||||
|
value: "metrics.jarrs.eu" # DNS-only A record -> Hetzner IPv4
|
||||||
|
- name: IRTT_PORT
|
||||||
|
value: "2112"
|
||||||
|
- name: IPERF_PORT
|
||||||
|
value: "5201"
|
||||||
|
- name: IRTT_INTERVAL
|
||||||
|
value: "20ms"
|
||||||
|
- name: IRTT_DURATION
|
||||||
|
value: "60" # seconds (numeric)
|
||||||
|
- name: TPUT_EVERY
|
||||||
|
value: "900" # 15 min
|
||||||
|
- name: TPUT_TIME
|
||||||
|
value: "10"
|
||||||
|
- name: IPERF_PARALLEL
|
||||||
|
value: "4"
|
||||||
|
- name: IRTT_HMAC # shared key; apply via secret (see below)
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: wan-monitor-irtt
|
||||||
|
key: hmac
|
||||||
|
optional: true
|
||||||
|
resources:
|
||||||
|
requests: { cpu: 20m, memory: 48Mi }
|
||||||
|
limits: { memory: 96Mi }
|
||||||
|
volumeMounts:
|
||||||
|
- name: scripts
|
||||||
|
mountPath: /scripts
|
||||||
|
readOnly: true
|
||||||
|
- name: shared
|
||||||
|
mountPath: /shared
|
||||||
|
|
||||||
|
- name: metrics-http
|
||||||
|
image: busybox:1.36
|
||||||
|
command: ["httpd", "-f", "-v", "-p", "9116", "-h", "/shared"]
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
containerPort: 9116
|
||||||
|
resources:
|
||||||
|
requests: { cpu: 5m, memory: 8Mi }
|
||||||
|
limits: { memory: 24Mi }
|
||||||
|
volumeMounts:
|
||||||
|
- name: shared
|
||||||
|
mountPath: /shared
|
||||||
|
readOnly: true
|
||||||
|
volumes:
|
||||||
|
- name: blackbox-config
|
||||||
|
configMap:
|
||||||
|
name: wan-monitor-blackbox
|
||||||
|
- name: scripts
|
||||||
|
configMap:
|
||||||
|
name: wan-monitor-scripts
|
||||||
|
- name: shared
|
||||||
|
emptyDir: {}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: wan-monitor
|
||||||
|
namespace: admin-system
|
||||||
|
labels:
|
||||||
|
app: wan-monitor
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
selector:
|
||||||
|
app: wan-monitor
|
||||||
|
ports:
|
||||||
|
- name: blackbox
|
||||||
|
port: 9115
|
||||||
|
targetPort: 9115
|
||||||
|
- name: metrics
|
||||||
|
port: 9116
|
||||||
|
targetPort: 9116
|
||||||
@@ -237,6 +237,75 @@ data:
|
|||||||
regex: 'ak-outpost-(.*)-outpost'
|
regex: 'ak-outpost-(.*)-outpost'
|
||||||
replacement: '$1'
|
replacement: '$1'
|
||||||
|
|
||||||
|
# --- end-to-end latency + loss (ICMP) to many destinations ---
|
||||||
|
- job_name: 'wan-icmp'
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module: [icmp]
|
||||||
|
scrape_interval: 15s
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- 8.8.8.8
|
||||||
|
- 1.1.1.1
|
||||||
|
- jarrs.eu # Cloudflare-proxied front; the Hetzner origin itself is metrics.jarrs.eu
|
||||||
|
- telex.hu
|
||||||
|
- store.steampowered.com
|
||||||
|
- 192.168.0.1 # gateway
|
||||||
|
- 37.191.56.193 # your public IP (update if it changes)
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: wan-monitor.admin-system:9115
|
||||||
|
|
||||||
|
# --- HTTP phase breakdown (dns/connect/tls/processing/transfer) ---
|
||||||
|
- job_name: 'wan-http'
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module: [http_2xx]
|
||||||
|
scrape_interval: 30s
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- https://telex.hu
|
||||||
|
- https://store.steampowered.com
|
||||||
|
- https://jarrs.eu
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: wan-monitor.admin-system:9115
|
||||||
|
|
||||||
|
# --- DNS resolution time per resolver (Pi-hole vs public) ---
|
||||||
|
- job_name: 'wan-dns'
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module: [dns_udp]
|
||||||
|
scrape_interval: 30s
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- 192.168.0.250 # Pi-hole
|
||||||
|
- 1.1.1.1
|
||||||
|
- 8.8.8.8
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: instance
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: wan-monitor.admin-system:9115
|
||||||
|
|
||||||
|
# --- irtt (UDP quality) + iperf3 (throughput) textfile metrics ---
|
||||||
|
- job_name: 'wan-probe'
|
||||||
|
metrics_path: /metrics
|
||||||
|
scrape_interval: 30s
|
||||||
|
fallback_scrape_protocol: PrometheusText0.0.4
|
||||||
|
static_configs:
|
||||||
|
- targets: ['wan-monitor.admin-system:9116']
|
||||||
|
|
||||||
# CloudNativePG - Postgres metrics per instance
|
# CloudNativePG - Postgres metrics per instance
|
||||||
- job_name: 'cloudnativepg'
|
- job_name: 'cloudnativepg'
|
||||||
kubernetes_sd_configs:
|
kubernetes_sd_configs:
|
||||||
@@ -621,7 +690,7 @@ spec:
|
|||||||
memory: 128Mi
|
memory: 128Mi
|
||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 256Mi
|
memory: 768Mi
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: data
|
- name: data
|
||||||
mountPath: /var/lib/grafana
|
mountPath: /var/lib/grafana
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: prometheus
|
app: prometheus
|
||||||
data:
|
data:
|
||||||
|
|
||||||
authentik-alerts.yml: |
|
authentik-alerts.yml: |
|
||||||
groups:
|
groups:
|
||||||
- name: authentik-availability
|
- name: authentik-availability
|
||||||
@@ -210,3 +210,104 @@ data:
|
|||||||
annotations:
|
annotations:
|
||||||
summary: "Longhorn node {{ $labels.node }} storage pressure"
|
summary: "Longhorn node {{ $labels.node }} storage pressure"
|
||||||
description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
|
description: "Node {{ $labels.node }} disk usage is at {{ printf \"%.1f\" $value }}%."
|
||||||
|
|
||||||
|
# Add this as a new data key (wan-alerts.yml) in the existing
|
||||||
|
# prometheus-rules ConfigMap (mon-system). Thresholds anchored to One.hu's
|
||||||
|
# "normal conditions" figures: 700 Mbit/s down / 28 Mbit/s up.
|
||||||
|
# Throughput is sampled every ~15 min, so `for:` spans >=2 samples to avoid
|
||||||
|
# firing on a single fluke. Recalibrate floors after a week of baseline data.
|
||||||
|
# NOTE: uses Prometheus template funcs (humanize/humanizePercentage/humanizeDuration);
|
||||||
|
# mul/div are NOT valid Prometheus template functions.
|
||||||
|
wan-alerts.yml: |
|
||||||
|
groups:
|
||||||
|
- name: wan-quality-alerts
|
||||||
|
rules:
|
||||||
|
# --- upstream loss: the prime suspect for dropped calls / WireGuard ---
|
||||||
|
- alert: WanUpstreamPacketLoss
|
||||||
|
expr: wan_irtt_loss_ratio{direction="upstream",condition="idle"} > 0.01
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN upstream packet loss to {{ $labels.target }}"
|
||||||
|
description: "irtt upstream loss {{ $value | humanizePercentage }} (>1%) for 2m. Cable-upstream symptom; capture for ISP."
|
||||||
|
|
||||||
|
- alert: WanDownstreamPacketLoss
|
||||||
|
expr: wan_irtt_loss_ratio{direction="downstream",condition="idle"} > 0.01
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN downstream packet loss to {{ $labels.target }}"
|
||||||
|
description: "irtt downstream loss {{ $value | humanizePercentage }} (>1%) for 2m."
|
||||||
|
|
||||||
|
# --- latency / jitter ---
|
||||||
|
- alert: WanLatencyHigh
|
||||||
|
expr: wan_irtt_rtt_seconds{stat="max",condition="idle"} > 0.08
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN RTT spikes to {{ $labels.target }}"
|
||||||
|
description: "irtt max RTT {{ $value | humanizeDuration }} (>80 ms) for 5m (idle). Real-time apps will feel this."
|
||||||
|
|
||||||
|
- alert: WanJitterHigh
|
||||||
|
expr: wan_irtt_jitter_seconds{direction="round_trip",condition="idle"} > 0.03
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN jitter high to {{ $labels.target }}"
|
||||||
|
description: "Round-trip jitter {{ $value | humanizeDuration }} (>30 ms) for 5m. Degrades VoIP/video."
|
||||||
|
|
||||||
|
# --- bufferbloat: latency added while the line is saturated ---
|
||||||
|
- alert: WanBufferbloat
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
wan_irtt_rtt_seconds{stat="mean",condition="under_load"}
|
||||||
|
- on(target) wan_irtt_rtt_seconds{stat="mean",condition="idle"}
|
||||||
|
) > 0.1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: "WAN bufferbloat on {{ $labels.target }}"
|
||||||
|
description: "RTT rises {{ $value | humanizeDuration }} under load (>100 ms). Line buckles when saturated."
|
||||||
|
|
||||||
|
# --- throughput vs One.hu "normal" 700/28 (alert below 50%) ---
|
||||||
|
- alert: WanDownloadDegraded
|
||||||
|
expr: wan_throughput_bits_per_second{direction="download"} < 350e6
|
||||||
|
for: 20m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN download below half of plan"
|
||||||
|
description: "Download {{ $value | humanize }}bit/s (< 350M, half of 700 normal) for 20m."
|
||||||
|
|
||||||
|
- alert: WanUploadDegraded
|
||||||
|
expr: wan_throughput_bits_per_second{direction="upload"} < 14e6
|
||||||
|
for: 20m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN upload below half of plan"
|
||||||
|
description: "Upload {{ $value | humanize }}bit/s (< 14M, half of 28 normal) for 20m."
|
||||||
|
|
||||||
|
# --- the monitor itself stopped producing data ---
|
||||||
|
- alert: WanProbeStalled
  # Each probe gets its own staleness window. The irtt timestamp is refreshed
  # every loop cycle (~IRTT_DURATION = 60 s), but the throughput test runs only
  # every TPUT_EVERY = 900 s, so a single 300 s limit would fire for the
  # throughput probe during most of every 15-minute interval. 2100 s (35 min)
  # corresponds to two consecutive missed throughput runs.
  expr: |
    (time() - max by(probe) (wan_probe_last_run_timestamp_seconds{probe="irtt"}) > 300)
    or
    (time() - max by(probe) (wan_probe_last_run_timestamp_seconds{probe="throughput"}) > 2100)
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "WAN probe '{{ $labels.probe }}' stalled"
    description: "No fresh {{ $labels.probe }} samples for {{ $value | humanizeDuration }} (limit: 5 min for irtt, 35 min for throughput). Check the wan-monitor pod / Hetzner endpoint."
|
||||||
|
|
||||||
|
- alert: WanBlackboxTargetDown
|
||||||
|
expr: probe_success{job=~"wan-.*"} == 0
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "WAN probe to {{ $labels.instance }} failing"
|
||||||
|
description: "blackbox {{ $labels.job }} to {{ $labels.instance }} unreachable for 3m."
|
||||||
@@ -10,7 +10,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/instance: outline
|
app.kubernetes.io/instance: outline
|
||||||
app.kubernetes.io/name: outline
|
app.kubernetes.io/name: outline
|
||||||
app.kubernetes.io/version: 1.1.0
|
app.kubernetes.io/version: 1.8.1
|
||||||
name: outline
|
name: outline
|
||||||
namespace: outline-system
|
namespace: outline-system
|
||||||
spec:
|
spec:
|
||||||
@@ -31,7 +31,7 @@ spec:
|
|||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: outline
|
- name: outline
|
||||||
image: outlinewiki/outline:1.8.0
|
image: outlinewiki/outline:1.8.1
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
env:
|
env:
|
||||||
- name: NODE_ENV
|
- name: NODE_ENV
|
||||||
@@ -331,7 +331,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/instance: outline
|
app.kubernetes.io/instance: outline
|
||||||
app.kubernetes.io/name: outline
|
app.kubernetes.io/name: outline
|
||||||
app.kubernetes.io/version: 1.1.0
|
app.kubernetes.io/version: 1.8.1
|
||||||
name: outline
|
name: outline
|
||||||
namespace: outline-system
|
namespace: outline-system
|
||||||
spec:
|
spec:
|
||||||
|
|||||||
@@ -904,7 +904,14 @@ spec:
|
|||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: seerr
|
- name: seerr
|
||||||
image: docker.io/fallenbagel/jellyseerr:preview-OIDC
|
# 2026-06-06: migrating from fallenbagel/jellyseerr:preview-OIDC
|
||||||
|
# (a custom OIDC-capable build) to seerr-team/seerr v3.x — the
|
||||||
|
# successor project (combined Overseerr+Jellyseerr team rebrand
|
||||||
|
# from v3.0.0). Mainline now has native OIDC support so we don't
|
||||||
|
# need the custom build. Migration is auto on first start; backed
|
||||||
|
# up the config PVC to ~/seerr-backups on dooplex before this PR.
|
||||||
|
# https://docs.seerr.dev/migration-guide
|
||||||
|
image: ghcr.io/seerr-team/seerr:v3.3.0
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
env:
|
env:
|
||||||
- name: TZ
|
- name: TZ
|
||||||
|
|||||||
Reference in New Issue
Block a user