Files
homelab-manifests/admin-system/wan-monitor.yaml
T

302 lines
11 KiB
YAML

---
# ============================================================================
# wan-monitor — internet connection quality monitoring
# Single pod (3 containers) in admin-system:
# - blackbox : prometheus blackbox-exporter (HTTP phases, ICMP, DNS) :9115
# - wan-probe : irtt (UDP quality) + iperf3 (throughput) loop -> /shared
# - metrics-http : busybox httpd serving /shared/metrics :9116
# Prometheus scrapes :9115 (blackbox relabel jobs) and :9116 (textfile metrics).
# Scrape jobs live in prometheus-wan-scrape-jobs.yaml (merge into monitoring.yaml).
# ============================================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-blackbox
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  # blackbox-exporter module definitions; the Deployment mounts this ConfigMap
  # at /etc/blackbox and passes --config.file=/etc/blackbox/blackbox.yml.
  # Every module is pinned to IPv4 with fallback disabled so that an IPv4
  # failure is reported as a failure instead of being answered over IPv6.
  blackbox.yml: |
    modules:
      # HTTP GET probe; TLS is not required (fail_if_not_ssl: false).
      http_2xx:
        prober: http
        timeout: 10s
        http:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
          method: GET
          fail_if_not_ssl: false
      # ICMP echo probe (the container needs NET_RAW for this prober).
      icmp:
        prober: icmp
        timeout: 5s
        icmp:
          preferred_ip_protocol: ip4
          ip_protocol_fallback: false
      # DNS A-record lookup over UDP against the probed resolver.
      dns_udp:
        prober: dns
        timeout: 5s
        dns:
          transport_protocol: udp
          preferred_ip_protocol: ip4
          # Matches http_2xx and icmp: never fall back to IPv6.
          ip_protocol_fallback: false
          query_name: "telex.hu"
          query_type: "A"
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: wan-monitor-scripts
  namespace: admin-system
  labels:
    app: wan-monitor
data:
  # Static HELP/TYPE preamble. probe-loop.sh concatenates it in front of the
  # per-probe sample fragments to build the served /shared/metrics file.
  metrics-header.prom: |
    # HELP wan_irtt_rtt_seconds irtt round-trip time by statistic (seconds)
    # TYPE wan_irtt_rtt_seconds gauge
    # HELP wan_irtt_jitter_seconds irtt IPDV jitter mean by direction (seconds)
    # TYPE wan_irtt_jitter_seconds gauge
    # HELP wan_irtt_loss_ratio irtt packet loss ratio by direction (0-1)
    # TYPE wan_irtt_loss_ratio gauge
    # HELP wan_irtt_late_ratio irtt late/reordered packet ratio (0-1)
    # TYPE wan_irtt_late_ratio gauge
    # HELP wan_irtt_duplicate_ratio irtt duplicate packet ratio (0-1)
    # TYPE wan_irtt_duplicate_ratio gauge
    # HELP wan_irtt_packets irtt packet counters for the run
    # TYPE wan_irtt_packets gauge
    # HELP wan_irtt_success 1 if the irtt run produced stats
    # TYPE wan_irtt_success gauge
    # HELP wan_throughput_bits_per_second achieved throughput (bits/sec)
    # TYPE wan_throughput_bits_per_second gauge
    # HELP wan_throughput_success 1 if the throughput test succeeded
    # TYPE wan_throughput_success gauge
    # HELP wan_probe_last_run_timestamp_seconds unix time of last probe run
    # TYPE wan_probe_last_run_timestamp_seconds gauge
  # Converts one irtt JSON report into Prometheus samples.
  irtt_to_prom.py: |
    #!/usr/bin/env python3
    # irtt JSON (stdin) -> Prometheus sample lines (no HELP/TYPE; header is static).
    # args: <condition> <target>
    #
    # Output contract: wan_irtt_success and the run timestamp are ALWAYS the
    # last two lines. Individual stats are emitted only when present and
    # numeric, so a report with a missing field (NOTE(review): irtt may omit
    # some stats, e.g. the RTT median, when few or no round trips complete --
    # confirm against the irtt JSON docs) no longer aborts the script half-way
    # and leaves a truncated fragment without a success flag.
    import json, sys, time

    cond = sys.argv[1] if len(sys.argv) > 1 else "idle"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    L = f'target="{target}",condition="{cond}"'
    ts = f'{time.time():.0f}'


    def finish(ok):
        """Print the success flag and run timestamp, then exit with status 0."""
        print(f'wan_irtt_success{{{L}}} {ok}')
        print(f'wan_probe_last_run_timestamp_seconds{{probe="irtt",{L}}} {ts}')
        sys.exit(0)


    try:
        s = json.load(sys.stdin)["stats"]
        if not isinstance(s, dict):
            raise ValueError("stats is not a JSON object")
    except Exception:
        # Empty/invalid input or no usable "stats" object: report failure only.
        finish(0)


    def emit(metric, extra, path, div=None):
        """Print one sample read from s[path[0]][path[1]]...

        extra: additional label text appended after the common labels ("" for none).
        div:   optional divisor (1e9 for ns -> s, 100.0 for percent -> ratio).
        The sample is skipped when the field is absent or not a number.
        """
        v = s
        try:
            for key in path:
                v = v[key]
        except (KeyError, TypeError, IndexError):
            return
        if isinstance(v, bool) or not isinstance(v, (int, float)):
            return
        if div is not None:
            v = v / div
        labels = f'{L},{extra}' if extra else L
        print(f'{metric}{{{labels}}} {v}')


    for k in ("min", "mean", "median", "max", "stddev"):
        emit("wan_irtt_rtt_seconds", f'stat="{k}"', ("rtt", k), 1e9)
    emit("wan_irtt_jitter_seconds", 'direction="round_trip"', ("ipdv_round_trip", "mean"), 1e9)
    emit("wan_irtt_jitter_seconds", 'direction="send"', ("ipdv_send", "mean"), 1e9)
    emit("wan_irtt_jitter_seconds", 'direction="receive"', ("ipdv_receive", "mean"), 1e9)
    emit("wan_irtt_loss_ratio", 'direction="round_trip"', ("packet_loss_percent",), 100.0)
    emit("wan_irtt_loss_ratio", 'direction="upstream"', ("upstream_loss_percent",), 100.0)
    emit("wan_irtt_loss_ratio", 'direction="downstream"', ("downstream_loss_percent",), 100.0)
    emit("wan_irtt_late_ratio", "", ("late_packets_percent",), 100.0)
    emit("wan_irtt_duplicate_ratio", "", ("duplicate_percent",), 100.0)
    emit("wan_irtt_packets", 'kind="sent"', ("packets_sent",))
    emit("wan_irtt_packets", 'kind="received"', ("packets_received",))
    emit("wan_irtt_packets", 'kind="server_received"', ("server_packets_received",))
    finish(1)
  # Converts one iperf3 JSON report into Prometheus samples.
  tput_to_prom.py: |
    #!/usr/bin/env python3
    # iperf3 JSON (stdin) -> Prometheus sample lines. args: <direction> <target>
    import json, sys, time

    direction = sys.argv[1] if len(sys.argv) > 1 else "download"
    target = sys.argv[2] if len(sys.argv) > 2 else "hetzner"
    L = f'target="{target}",direction="{direction}"'
    ts = f'{time.time():.0f}'
    try:
        # Receiver-side rate from the end-of-test summary.
        bps = json.load(sys.stdin)["end"]["sum_received"]["bits_per_second"]
        print(f'wan_throughput_bits_per_second{{{L}}} {bps:.0f}')
        print(f'wan_throughput_success{{{L}}} 1')
    except Exception:
        # Empty/invalid JSON or missing summary: report failure only.
        print(f'wan_throughput_success{{{L}}} 0')
    # Timestamp is written whether or not the test succeeded.
    print(f'wan_probe_last_run_timestamp_seconds{{probe="throughput",{L}}} {ts}')
  # Main loop of the wan-probe container (run as: /bin/sh /scripts/probe-loop.sh).
  probe-loop.sh: |
    #!/bin/sh
    # Runs irtt continuously and iperf3 every TPUT_EVERY seconds, writing
    # Prometheus samples to fragment files under /shared and publishing the
    # concatenation as /shared/metrics.
    set -u
    SHARED=/shared
    HDR=/scripts/metrics-header.prom
    HETZNER="${HETZNER_HOST:?set HETZNER_HOST}"   # mandatory: abort if unset/empty
    IRTT_PORT="${IRTT_PORT:-2112}"
    IPERF_PORT="${IPERF_PORT:-5201}"
    IRTT_INTERVAL="${IRTT_INTERVAL:-20ms}"
    IRTT_DURATION="${IRTT_DURATION:-60s}"
    TPUT_EVERY="${TPUT_EVERY:-900}"   # seconds between throughput tests
    TPUT_TIME="${TPUT_TIME:-10}"      # iperf3 seconds per direction
    IRTT_TARGET="${IRTT_TARGET:-hetzner}"
    TPUT_TARGET="${TPUT_TARGET:-hetzner}"
    # Optional HMAC flag; left empty (and expanded unquoted below) when no key is set.
    HMAC_OPT=""
    [ -n "${IRTT_HMAC:-}" ] && HMAC_OPT="--hmac=${IRTT_HMAC}"
    mkdir -p "$SHARED"
    # Start with empty fragments so assemble() can always cat all of them.
    : > "$SHARED/.irtt.prom"; : > "$SHARED/.irttload.prom"; : > "$SHARED/.tput.prom"
    cp "$HDR" "$SHARED/metrics"   # serve header immediately so first scrapes don't 404
    # Build the metrics file in a temp file, then rename it into place so a
    # scrape never reads a half-written file.
    assemble() {
      cat "$HDR" "$SHARED/.irtt.prom" "$SHARED/.irttload.prom" "$SHARED/.tput.prom" \
        > "$SHARED/.metrics.tmp" 2>/dev/null
      mv "$SHARED/.metrics.tmp" "$SHARED/metrics"
    }
    run_irtt() {   # $1 condition  $2 outfile  $3 duration
      irtt client -i "$IRTT_INTERVAL" -d "$3" -q $HMAC_OPT -o - "${HETZNER}:${IRTT_PORT}" 2>/dev/null \
        | python3 /scripts/irtt_to_prom.py "$1" "$IRTT_TARGET" > "$2"
    }
    # Download (-R, reverse mode) first, then upload; the upload result is
    # appended to the same fragment file.
    run_tput() {
      P="${IPERF_PARALLEL:-4}"   # parallel streams: a single stream can't fill 1 Gbps over the RTT
      iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" -R -J 2>/dev/null \
        | python3 /scripts/tput_to_prom.py download "$TPUT_TARGET" > "$SHARED/.tput.prom"
      iperf3 -c "$HETZNER" -p "$IPERF_PORT" -t "$TPUT_TIME" -P "$P" -J 2>/dev/null \
        | python3 /scripts/tput_to_prom.py upload "$TPUT_TARGET" >> "$SHARED/.tput.prom"
    }
    last_tput=0   # 0 => the first throughput test runs right after the first irtt run
    while true; do
      run_irtt idle "$SHARED/.irtt.prom" "$IRTT_DURATION"   # blocks ~IRTT_DURATION = loop cadence
      assemble
      now=$(date +%s)
      if [ $(( now - last_tput )) -ge "$TPUT_EVERY" ]; then
        # Long enough to span both iperf3 directions plus a small margin.
        LOAD_DUR=$(( 2 * TPUT_TIME + 4 ))s
        run_irtt under_load "$SHARED/.irttload.prom" "$LOAD_DUR" &   # concurrent = bufferbloat
        LOADPID=$!
        run_tput
        wait "$LOADPID" 2>/dev/null
        last_tput="$now"
        assemble
      fi
    done
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: wan-monitor
  template:
    metadata:
      labels:
        app: wan-monitor
      annotations:
        # version-checker: one enable flag and one tag regex per container.
        enable.version-checker.io/blackbox: "true"
        enable.version-checker.io/metrics-http: "true"
        enable.version-checker.io/wan-probe: "true"
        match-regex.version-checker.io/blackbox: '^v[0-9]+\.[0-9]+\.[0-9]+$'
        match-regex.version-checker.io/metrics-http: '^[0-9]+\.[0-9]+\.[0-9]+$'
        match-regex.version-checker.io/wan-probe: '^[0-9]+\.[0-9]+\.[0-9]+$'
    spec:
      enableServiceLinks: false
      containers:
        # --- blackbox-exporter: HTTP / ICMP / DNS probes on :9115 ---------
        - name: blackbox
          image: quay.io/prometheus/blackbox-exporter:v0.25.0
          args:
            - --config.file=/etc/blackbox/blackbox.yml
            - --web.listen-address=:9115
          ports:
            - name: blackbox
              containerPort: 9115
          securityContext:
            capabilities:
              # required for the ICMP prober
              add:
                - "NET_RAW"
          resources:
            requests:
              cpu: 10m
              memory: 32Mi
            limits:
              memory: 64Mi
          volumeMounts:
            - name: blackbox-config
              mountPath: /etc/blackbox
              readOnly: true
        # --- wan-probe: irtt + iperf3 loop writing samples to /shared -----
        - name: wan-probe
          # Build + push from Dockerfile.wan-probe (adjust registry/tag to taste)
          image: gitea.dooplex.hu/admin/wan-probe:0.1.0
          command:
            - "/bin/sh"
            - "/scripts/probe-loop.sh"
          env:
            # irtt + iperf3 server (your Hetzner box)
            - name: HETZNER_HOST
              value: "jarrs.eu"
            - name: IRTT_PORT
              value: "2112"
            - name: IPERF_PORT
              value: "5201"
            - name: IRTT_INTERVAL
              value: "20ms"
            - name: IRTT_DURATION
              value: "60s"
            # 15 min
            - name: TPUT_EVERY
              value: "900"
            - name: TPUT_TIME
              value: "10"
            - name: IPERF_PARALLEL
              value: "4"
            # shared key; apply via secret. Optional: the pod still starts
            # when the secret or key does not exist.
            - name: IRTT_HMAC
              valueFrom:
                secretKeyRef:
                  name: wan-monitor-irtt
                  key: hmac
                  optional: true
          resources:
            requests:
              cpu: 20m
              memory: 48Mi
            limits:
              memory: 96Mi
          volumeMounts:
            - name: scripts
              mountPath: /scripts
              readOnly: true
            - name: shared
              mountPath: /shared
        # --- metrics-http: busybox httpd serving /shared on :9116 ---------
        - name: metrics-http
          image: busybox:1.36
          command:
            - "httpd"
            - "-f"
            - "-v"
            - "-p"
            - "9116"
            - "-h"
            - "/shared"
          ports:
            - name: metrics
              containerPort: 9116
          resources:
            requests:
              cpu: 5m
              memory: 8Mi
            limits:
              memory: 24Mi
          volumeMounts:
            # Read-only here; only wan-probe writes to the shared volume.
            - name: shared
              mountPath: /shared
              readOnly: true
      volumes:
        - name: blackbox-config
          configMap:
            name: wan-monitor-blackbox
        - name: scripts
          configMap:
            name: wan-monitor-scripts
        # Scratch space shared between wan-probe (writer) and metrics-http (reader).
        - name: shared
          emptyDir: {}
---
apiVersion: v1
kind: Service
metadata:
  name: wan-monitor
  namespace: admin-system
  labels:
    app: wan-monitor
spec:
  type: ClusterIP
  # Selects the pod created by the wan-monitor Deployment.
  selector:
    app: wan-monitor
  ports:
    # blackbox-exporter probe endpoint
    - name: blackbox
      protocol: TCP
      port: 9115
      targetPort: 9115
    # textfile metrics served by the metrics-http container
    - name: metrics
      protocol: TCP
      port: 9116
      targetPort: 9116